From 897495fbe517c3977d9f717ea2687be6372b3d3d Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 24 May 2024 23:49:57 +0800 Subject: [PATCH 01/71] kernel v0 done --- vllm/lora/ops/__init__.py | 0 vllm/lora/ops/sgmv_expand.py | 134 +++++++++++++++++++++++++++++++++++ vllm/lora/ops/sgmv_shrink.py | 134 +++++++++++++++++++++++++++++++++++ vllm/lora/punica.py | 3 +- 4 files changed, 270 insertions(+), 1 deletion(-) create mode 100644 vllm/lora/ops/__init__.py create mode 100644 vllm/lora/ops/sgmv_expand.py create mode 100644 vllm/lora/ops/sgmv_shrink.py diff --git a/vllm/lora/ops/__init__.py b/vllm/lora/ops/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py new file mode 100644 index 000000000000..b6bcca9fe8d2 --- /dev/null +++ b/vllm/lora/ops/sgmv_expand.py @@ -0,0 +1,134 @@ +import triton +import triton.language as tl +import torch + + +@triton.jit +def _sgmv_expand_kernel( + input_ptr, + lora_ptr, + out_ptr, + N, + K, + b_seq_start_loc, + seq_lens, + lora_indices, + xm_stride, + xk_stride, # 1 + l0_stride, # hidden_size*max_rank + lora_k_stride, + lora_n_stride, + cm_stride, + cn_stride, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, +): + pid = tl.program_id(axis=0) + cta_n_num = tl.cdiv(N, BLOCK_N) + pid_m = pid // cta_n_num + pid_n = pid % cta_n_num + + cur_batch = tl.program_id(axis=1) + M = tl.load(seq_lens + cur_batch) + if pid_m * BLOCK_M > M: + return + cur_seq_start = tl.load(b_seq_start_loc + cur_batch) + offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + offset_k = tl.arange(0, BLOCK_K) + ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + lora_index = tl.load(lora_indices + cur_batch) + + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + + offset_k[None, :] * xk_stride, ) + b_ptr = (lora_ptr + l0_stride * lora_index + + offset_k[:, None] * lora_n_stride + rbn[None, :] * lora_k_stride) + accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for k in range(tl.cdiv(K, BLOCK_K)): + if EVEN_K: + tiled_a = tl.load(a_ptr) + tiled_b = tl.load(b_ptr) + else: + tiled_a = tl.load(a_ptr, + mask=offset_k[None, :] < K - k * BLOCK_K, + other=0) + tiled_b = tl.load(b_ptr, + mask=offset_k[:, None] < K - k * BLOCK_K, + other=0) + accumulator += tl.dot( + tiled_a, + tiled_b, + ) + a_ptr += BLOCK_K * xk_stride + b_ptr += BLOCK_K * lora_n_stride + tiled_c = accumulator.to(input_ptr.dtype.element_ty) + offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + c_ptr = (out_ptr + offset_cm[:, None] * cm_stride + + offset_cn[None, :] * cn_stride) + M = tl.load(seq_lens + cur_batch) + c_mask = (offset_cm[:, None] < + (cur_seq_start + M)) & (offset_cn[None, :] < N) + tl.store(c_ptr, tiled_c, mask=c_mask) + + +@torch.inference_mode() +def sgmv_expand( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batchs: int, + max_seq_length: int, +): + """_summary_ + + Args: + inputs (torch.Tensor): _description_ + lora_b_weights (torch.Tensor): _description_ + output_tensor (torch.Tensor): _description_ + b_seq_start_loc (torch.Tensor): _description_ + seq_len_tensor (torch.Tensor): 
_description_ + lora_indices_tensor (torch.Tensor): _description_ + batchs (int): _description_ + max_seq_length (int): _description_ + """ + _, N, K = lora_b_weights.shape # K= rank,N=hidden_size + + BLOCK_M = 32 + BLOCK_N = 32 + BLOCK_K = 16 + EVEN_K = K % BLOCK_K == 0 + + grid = [ + triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), + batchs, + ] + + _sgmv_expand_kernel[grid]( + inputs, + lora_b_weights, + output_tensor, + N, + K, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + inputs.stride(0), + inputs.stride(1), + lora_b_weights.stride(0), + lora_b_weights.stride(1), + lora_b_weights.stride(2), + output_tensor.stride(0), + output_tensor.stride(1), + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + ) + return diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py new file mode 100644 index 000000000000..595c93b89c54 --- /dev/null +++ b/vllm/lora/ops/sgmv_shrink.py @@ -0,0 +1,134 @@ +import triton +import triton.language as tl +import torch + + +@triton.jit +def _sgmv_shrink_kernel( + input_ptr, + lora_ptr, + out_ptr, + N, + K, + b_seq_start_loc, + seq_lens, + lora_indices, + xm_stride, # hidden_size + xk_stride, # 1 + l0_stride, # hidden_size*max_rank + lora_k_stride, + lora_n_stride, + cm_stride, + cn_stride, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, + SPLIT_K: tl.constexpr, +): + pid = tl.program_id(axis=0) + pid_sk = tl.program_id(axis=1) + cur_batch = tl.program_id(axis=2) + cta_n_num = tl.cdiv(N, BLOCK_N) + pid_m = pid // cta_n_num + pid_n = pid % cta_n_num + + M = tl.load(seq_lens + cur_batch) + if pid_m * BLOCK_M > M: + return + cur_seq_start = tl.load(b_seq_start_loc + cur_batch) + offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + offset_k = pid_sk * BLOCK_K + tl.arange(0, BLOCK_K) + + ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + lora_index = tl.load(lora_indices + cur_batch) + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + + offset_k[None, :] * xk_stride) + b_ptr = (lora_ptr + l0_stride * lora_index + rbn[None, :] * lora_k_stride + + offset_k[:, None] * lora_n_stride) + + accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): + if EVEN_K: + a = tl.load(a_ptr) + b = tl.load(b_ptr) + else: + k_remaining = K - k * (BLOCK_K * SPLIT_K) + a = tl.load(a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0) + b = tl.load(b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0) + accumulator += tl.dot(a, b) + a_ptr += BLOCK_K * SPLIT_K * xk_stride + b_ptr += BLOCK_K * SPLIT_K * lora_n_stride + offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + c_ptr = (out_ptr + offset_cm[:, None] * cm_stride + + offset_cn[None, :] * cn_stride) + + c_mask = (offset_cm[:, None] < + (cur_seq_start + M)) & (offset_cn[None, :] < N) + if SPLIT_K == 1: + tl.store(c_ptr, accumulator, mask=c_mask) + else: + tl.atomic_add(c_ptr, accumulator, mask=c_mask) + + +@torch.inference_mode() +def sgmv_shrink( + inputs: torch.Tensor, + lora_a_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batchs: int, + max_seq_length: int, +): + """_summary_ + + Args: + inputs (torch.Tensor): _description_ + 
lora_a_weights (torch.Tensor): _description_ + output_tensor (torch.Tensor): _description_ + b_seq_start_loc (torch.Tensor): _description_ + seq_len_tensor (torch.Tensor): _description_ + lora_indices_tensor (torch.Tensor): _description_ + batchs (int): _description_ + max_seq_length (int): _description_ + """ + _, N, K = lora_a_weights.shape # K=hidden_size,N=rank + BLOCK_M = 32 + BLOCK_N = 32 + BLOCK_K = 32 + SPLIT_K = 8 + EVEN_K = K % (SPLIT_K * BLOCK_K) == 0 + + grid = [ + triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), + SPLIT_K, + batchs, + ] + _sgmv_shrink_kernel[grid]( + inputs, + lora_a_weights, + output_tensor, + N, + K, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + inputs.stride(0), + inputs.stride(1), + lora_a_weights.stride(0), + lora_a_weights.stride(1), + lora_a_weights.stride(2), + output_tensor.stride(0), + output_tensor.stride(1), + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + SPLIT_K, + ) + return diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index c87bed54726f..8957b6168304 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -1,8 +1,9 @@ # Based on code from https://github.com/punica-ai/punica from typing import Optional - import torch +from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.sgmv_shrink import sgmv_shrink def _raise_import_error(e): From e50234ee32fa89ac41240d1d0dc5255d7dd78482 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Sun, 26 May 2024 00:26:32 +0800 Subject: [PATCH 02/71] add temp_test.py --- vllm/lora/ops/temp_test.py | 141 +++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 vllm/lora/ops/temp_test.py diff --git a/vllm/lora/ops/temp_test.py b/vllm/lora/ops/temp_test.py new file mode 100644 index 000000000000..79464266883b --- /dev/null +++ b/vllm/lora/ops/temp_test.py @@ -0,0 +1,141 @@ +import torch + +import pytest +from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.sgmv_shrink import sgmv_shrink + + + + +def ref_torch_groupgemm( + x_ptr, + lora_ptr, + batchs, + lora_indices_tensor, + seq_len_tensor, +) -> torch.Tensor: + out_list = [] + + current_offset = 0 + for lora_index, b_length in zip(range(batchs), seq_len_tensor): + input_weight = x_ptr[current_offset : b_length + current_offset, :] + current_offset += b_length + lora_weight = lora_ptr[lora_indices_tensor[lora_index]] + result = torch.nn.functional.linear(input_weight, lora_weight) + out_list.append(result) + out = torch.cat(out_list, dim=0) + return out + + +@pytest.mark.parametrize("batchs", [i for i in range(0, 128, 8)]) +@pytest.mark.parametrize("hidden_size", [128, 256, 512, 1024, 4096, 8192, 3424]) +@pytest.mark.parametrize("lora_nums", [4, 8, 16, 32, 64, 128]) +@pytest.mark.parametrize("max_rank", [1, 8, 16, 32, 64, 128]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16,torch.float32]) +@torch.inference_mode() +def test_shrink_kernel(batchs, hidden_size, lora_nums, max_rank, dtype): + SEED = [0xABCDABCD987] + torch.manual_seed(SEED[0]) + if batchs == 0: + batchs += 1 + + seq_len_tensor = torch.randint(1, 1024, (batchs,)).cuda() + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0, + ).cuda() + total_tokens = seq_len_tensor.sum() + + inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).cuda() + lora_a_weights = torch.rand( + (lora_nums, max_rank, hidden_size), # col-major + dtype=dtype, + ).cuda() + + lora_indices_tensor = torch.randint(0, lora_nums - 1, 
(batchs,)).cuda() + output_tensor = torch.zeros( + total_tokens, max_rank, dtype=torch.float32 + ).cuda() + + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + + sgmv_shrink( + inputs_tensor, + lora_a_weights, + output_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + ) + torch.cuda.synchronize() + torch_out_tensor = ref_torch_groupgemm( + inputs_tensor, + lora_a_weights, + batchs, + lora_indices_tensor, + seq_len_tensor, + ) + torch_out_tensor = torch_out_tensor.to(torch.float32) + assert torch.allclose(torch_out_tensor, output_tensor, atol=1e-2, rtol=1e-2) + +@pytest.mark.parametrize("batchs", [i for i in range(0, 128, 8)]) +@pytest.mark.parametrize("hidden_size", [128, 256, 512, 1024, 4096, 8192, 3424]) +@pytest.mark.parametrize("lora_nums", [4, 8, 16, 32, 64, 128]) +@pytest.mark.parametrize("max_rank", [1, 8, 16, 32, 64, 128]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16,torch.float32]) +@torch.inference_mode() +def test_expand_kernel(batchs, hidden_size, lora_nums, max_rank, dtype): + SEED = [0xABCDABCD987] + torch.manual_seed(SEED[0]) + if batchs == 0: + batchs += 1 + + seq_len_tensor = torch.randint(1, 1024, (batchs,)).cuda() + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0, + ).cuda() + total_tokens = seq_len_tensor.sum() + + inputs_tensor = torch.rand((total_tokens, max_rank), dtype=dtype).cuda() + lora_b_weights = torch.rand( + (lora_nums,hidden_size, max_rank), # col-major + dtype=dtype, + ).cuda() + + lora_indices_tensor = torch.randint(0, lora_nums - 1, (batchs,)).cuda() + output_tensor = torch.zeros( + total_tokens, hidden_size, dtype=dtype + ).cuda() + + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + + sgmv_expand( + inputs_tensor, + lora_b_weights, + output_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + ) + torch.cuda.synchronize() + torch_out_tensor = ref_torch_groupgemm( + inputs_tensor, + lora_b_weights, + batchs, + lora_indices_tensor, + seq_len_tensor, + ) + assert torch.allclose(torch_out_tensor, output_tensor, atol=1e-2, rtol=1e-2) From cdfa7c6ebda708a59a6aa7f30af8cd842f77cab9 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 27 May 2024 09:58:16 +0800 Subject: [PATCH 03/71] add unit test --- tests/lora/test_triton_sgmv.py | 326 +++++++++++++++++++++++++++++++++ vllm/lora/ops/sgmv_expand.py | 45 +++-- vllm/lora/ops/sgmv_shrink.py | 52 ++++-- vllm/lora/ops/temp_test.py | 141 -------------- vllm/lora/punica.py | 6 +- 5 files changed, 402 insertions(+), 168 deletions(-) create mode 100644 tests/lora/test_triton_sgmv.py delete mode 100644 vllm/lora/ops/temp_test.py diff --git a/tests/lora/test_triton_sgmv.py b/tests/lora/test_triton_sgmv.py new file mode 100644 index 000000000000..5cbd40f210fb --- /dev/null +++ b/tests/lora/test_triton_sgmv.py @@ -0,0 +1,326 @@ +import random + +import pytest +import torch + +import vllm.lora.punica as punica +from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.sgmv_shrink import sgmv_shrink + +#The current punica kernel supports dimension and adds a dimension of 3424. 
+HIDDEN_SIZES = [ + 128, + 256, + 512, + 1024, + 1152, + 1280, + 1536, + 2048, + 2304, + 2560, + 2752, + 3072, + 3424, + 3456, + 3584, + 4096, + 4608, + 5120, + 5504, + 5632, + 6144, + 6848, + 6912, + 7168, + 8192, + 9216, + 10240, + 11008, + 13824, + 14336, + 15360, + 22016, + 24576, + 27392, + 27648, + 32000, + 32256, + 32512, + 32768, + 33024, + 36864, + 43264, + 49152, + 64000, + 64256, + 102400, + 102656, + 128000, + 128256, +] +BATCHS = [i for i in range(0, 64, 8)] +NUM_LORA = [1, 4, 8, 16, 32, 64, 128, 256] +DTYPES = [torch.half, torch.bfloat16, torch.float32] +MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] +SCALES = [0.5] +OP_TYPES = ["shrink", "expand"] +SEED = [0] +CUDA_DEVICES = [f"cuda:{0}"] + + +def assert_close(a, b): + rtol, atol = { + torch.float16: (1e-2, 1e-2), + torch.bfloat16: (12e-2, 1e-2), + torch.float32: (1e-2, 1e-2), + }[a.dtype] + torch.testing.assert_close(a, b, rtol=rtol, atol=atol) + + +@torch.inference_mode() +def _punica_bgmv(out_tensor, inputs, lora_weights, indices, scaling): + layer_idx = 0 + punica.bgmv(out_tensor, inputs, lora_weights, indices, layer_idx, scaling) + return + + +def _torch_groupgemm( + out_tensor, + inputs, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batchs, + scaling, +) -> torch.Tensor: + out_list = [] + current_offset = 0 + for lora_index, b_length in zip(range(batchs), seq_len_tensor): + input_weight = inputs[current_offset:b_length + current_offset, :] + current_offset += b_length + lora_weight = lora_weights[lora_indices_tensor[lora_index]] + result = torch.nn.functional.linear(input_weight, lora_weight) + result *= scaling + out_list.append(result) + out_tensor.copy_(torch.cat(out_list, dim=0)) + return + + +def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, + op_type, device): + seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0, + ).to(device) + total_tokens = seq_len_tensor.sum() + if op_type == "shrink": + inputs_tensor = torch.rand((total_tokens, hidden_size), + dtype=dtype).to(device) + lora_weights = torch.rand( + (lora_nums, max_rank, hidden_size), # col-major + dtype=dtype, + ).to(device) + ref_out_tensor = torch.zeros((total_tokens, max_rank), + dtype=dtype, + device=inputs_tensor.device) + # NOTE shrink kernel using torch.float32 as output type + our_out_tensor = torch.zeros( + (total_tokens, max_rank), + dtype=torch.float32, + device=inputs_tensor.device, + ) + else: + inputs_tensor = torch.rand( + (total_tokens, max_rank), + dtype=dtype, + ).to(device) + lora_weights = torch.rand( + (lora_nums, hidden_size, max_rank), # col-major + dtype=dtype, + ).to(device) + ref_out_tensor = torch.zeros( + (total_tokens, hidden_size), + dtype=dtype, + device=inputs_tensor.device, + ) + our_out_tensor = torch.zeros( + (total_tokens, hidden_size), + dtype=dtype, + device=inputs_tensor.device, + ) + + lora_indices_tensor = torch.randint(0, + lora_nums - 1 if lora_nums > 1 else 1, + (batchs, )).to(device) + indices = torch.zeros((total_tokens), dtype=torch.long).to(device) + current_offset = 0 + for b_id in range(batchs): + lora_index = lora_indices_tensor[b_id] + indices[current_offset:current_offset + + seq_len_tensor[b_id]] = lora_index.item() + current_offset += seq_len_tensor[b_id].item() + return ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) + + 
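+# The tests below check the Triton SGMV kernels (sgmv_shrink / sgmv_expand)
+# against the plain PyTorch group-GEMM loop defined above (_torch_groupgemm)
+# and against the existing punica bgmv kernel, across batch sizes, LoRA
+# counts, ranks and dtypes.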
+@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_triton_sgmv( + batchs: int, + num_loras: int, + rank: int, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + torch.manual_seed(seed) + if batchs == 0: + batchs += 1 + hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) + hidden_size = HIDDEN_SIZES[hidden_size_index] + if hidden_size > 100000: + hidden_size = hidden_size // 4 # avoid OOM + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data( + batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, + device) # The sequence length is restricted to the range [1, 1024]. + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + if op_type == "shrink": + sgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + scaling, + ) + else: + sgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + ) + _torch_groupgemm( + ref_out_tensor, + inputs_tensor, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batchs, + scaling if op_type == "shrink" else 1.0, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) + + +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sgmv_punica_bgmv( + hidden_size, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error + if dtype == torch.float32 or hidden_size == 3424: + return + torch.manual_seed(seed) + batchs = 4 # Arbitrary values for testing + rank = 16 + seq_len = 333 # Arbitrary values for testing + num_loras = 8 # Arbitrary values for testing + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + if op_type == "shrink": + sgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + scaling, + ) + else: + sgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + _punica_bgmv( + ref_out_tensor, + inputs_tensor, + lora_weights_4d, + indices, + scaling if op_type == "shrink" else 1.0, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index b6bcca9fe8d2..354778926250 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -1,6 +1,6 @@ +import torch import triton import triton.language as tl -import torch @triton.jit @@ -25,22 +25,27 @@ def _sgmv_expand_kernel( BLOCK_K: tl.constexpr, EVEN_K: tl.constexpr, ): + """ + The sgmv's expand triton kernel is based on GroupGEMM. + The GEMM of Multi-LoRA can be considered as GroupGEMM. + """ pid = tl.program_id(axis=0) + cur_batch = tl.program_id(axis=1) cta_n_num = tl.cdiv(N, BLOCK_N) pid_m = pid // cta_n_num pid_n = pid % cta_n_num - - cur_batch = tl.program_id(axis=1) M = tl.load(seq_lens + cur_batch) if pid_m * BLOCK_M > M: return + lora_index = tl.load(lora_indices + cur_batch) + if lora_index == -1: + return cur_seq_start = tl.load(b_seq_start_loc + cur_batch) offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N offset_k = tl.arange(0, BLOCK_K) ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - lora_index = tl.load(lora_indices + cur_batch) a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + offset_k[None, :] * xk_stride, ) @@ -89,17 +94,31 @@ def sgmv_expand( """_summary_ Args: - inputs (torch.Tensor): _description_ - lora_b_weights (torch.Tensor): _description_ - output_tensor (torch.Tensor): _description_ - b_seq_start_loc (torch.Tensor): _description_ - seq_len_tensor (torch.Tensor): _description_ - lora_indices_tensor (torch.Tensor): _description_ - batchs (int): _description_ - max_seq_length (int): _description_ + inputs (torch.Tensor): input tensor + lora_b_weights (torch.Tensor): lora'a weight + output_tensor (torch.Tensor): output tensor + b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative + sequence lengths of the sequences in the batch, used to index + into sequence. E.g.,if the sequence length is [4, 6], it is + [0, 4, 10]. + seq_len_tensor (torch.Tensor): (batch_size,). 
record the sequence + length of the sequences in the batch + lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batchs (int): batch size + max_seq_length (int): The max sequence lengths of the sequences + in the batch """ + assert inputs.dtype == lora_b_weights.dtype + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert inputs.size(1) == lora_b_weights.size(-1) + assert b_seq_start_loc.size(0) == batchs + assert lora_indices_tensor.size(0) == batchs + assert inputs.is_contiguous() + assert lora_b_weights.is_contiguous() + assert output_tensor.is_contiguous() + # TODO tuning this config _, N, K = lora_b_weights.shape # K= rank,N=hidden_size - BLOCK_M = 32 BLOCK_N = 32 BLOCK_K = 16 diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 595c93b89c54..d3858d91791e 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -1,6 +1,6 @@ +import torch import triton import triton.language as tl -import torch @triton.jit @@ -13,6 +13,7 @@ def _sgmv_shrink_kernel( b_seq_start_loc, seq_lens, lora_indices, + scaling, xm_stride, # hidden_size xk_stride, # 1 l0_stride, # hidden_size*max_rank @@ -26,6 +27,11 @@ def _sgmv_shrink_kernel( EVEN_K: tl.constexpr, SPLIT_K: tl.constexpr, ): + """ + The sgmv's shrink triton kernel is based on GroupGEMM+SPLIT-K. + The GEMM of Multi-LoRA can be considered as GroupGEMM. Additionally, + introducing SPLIT-K can improve performance + """ pid = tl.program_id(axis=0) pid_sk = tl.program_id(axis=1) cur_batch = tl.program_id(axis=2) @@ -36,6 +42,9 @@ def _sgmv_shrink_kernel( M = tl.load(seq_lens + cur_batch) if pid_m * BLOCK_M > M: return + lora_index = tl.load(lora_indices + cur_batch) + if lora_index == -1: + return cur_seq_start = tl.load(b_seq_start_loc + cur_batch) offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N @@ -43,12 +52,11 @@ def _sgmv_shrink_kernel( ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - lora_index = tl.load(lora_indices + cur_batch) + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + offset_k[None, :] * xk_stride) b_ptr = (lora_ptr + l0_stride * lora_index + rbn[None, :] * lora_k_stride + offset_k[:, None] * lora_n_stride) - accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): if EVEN_K: @@ -68,6 +76,8 @@ def _sgmv_shrink_kernel( c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] < N) + accumulator *= scaling + # handles write-back with reduction-splitting if SPLIT_K == 1: tl.store(c_ptr, accumulator, mask=c_mask) else: @@ -84,19 +94,36 @@ def sgmv_shrink( lora_indices_tensor: torch.Tensor, batchs: int, max_seq_length: int, + scaling: float, ): - """_summary_ + """ Args: - inputs (torch.Tensor): _description_ - lora_a_weights (torch.Tensor): _description_ - output_tensor (torch.Tensor): _description_ - b_seq_start_loc (torch.Tensor): _description_ - seq_len_tensor (torch.Tensor): _description_ - lora_indices_tensor (torch.Tensor): _description_ - batchs (int): _description_ - max_seq_length (int): _description_ + inputs (torch.Tensor): input tensor + lora_a_weights (torch.Tensor): lora'a weight + output_tensor (torch.Tensor): output tensor + b_seq_start_loc (torch.Tensor): (batch_size,). 
The cumulative + sequence lengths of the sequences in the batch, used to index + into sequence. E.g.,if the sequence length is [4, 6], it is + [0, 4, 10]. + seq_len_tensor (torch.Tensor): (batch_size,). record the sequence + length of the sequences in the batch + lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batchs (int): batch size + max_seq_length (int): The max sequence lengths of the sequences + in the batch + scaling (float): Scaling factor. """ + assert inputs.dtype == lora_a_weights.dtype + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert inputs.size(1) == lora_a_weights.size(-1) + assert b_seq_start_loc.size(0) == batchs + assert lora_indices_tensor.size(0) == batchs + assert inputs.is_contiguous() + assert lora_a_weights.is_contiguous() + assert output_tensor.is_contiguous() + # TODO tuning this config _, N, K = lora_a_weights.shape # K=hidden_size,N=rank BLOCK_M = 32 BLOCK_N = 32 @@ -118,6 +145,7 @@ def sgmv_shrink( b_seq_start_loc, seq_len_tensor, lora_indices_tensor, + scaling, inputs.stride(0), inputs.stride(1), lora_a_weights.stride(0), diff --git a/vllm/lora/ops/temp_test.py b/vllm/lora/ops/temp_test.py deleted file mode 100644 index 79464266883b..000000000000 --- a/vllm/lora/ops/temp_test.py +++ /dev/null @@ -1,141 +0,0 @@ -import torch - -import pytest -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_shrink import sgmv_shrink - - - - -def ref_torch_groupgemm( - x_ptr, - lora_ptr, - batchs, - lora_indices_tensor, - seq_len_tensor, -) -> torch.Tensor: - out_list = [] - - current_offset = 0 - for lora_index, b_length in zip(range(batchs), seq_len_tensor): - input_weight = x_ptr[current_offset : b_length + current_offset, :] - current_offset += b_length - lora_weight = lora_ptr[lora_indices_tensor[lora_index]] - result = torch.nn.functional.linear(input_weight, lora_weight) - out_list.append(result) - out = torch.cat(out_list, dim=0) - return out - - -@pytest.mark.parametrize("batchs", [i for i in range(0, 128, 8)]) -@pytest.mark.parametrize("hidden_size", [128, 256, 512, 1024, 4096, 8192, 3424]) -@pytest.mark.parametrize("lora_nums", [4, 8, 16, 32, 64, 128]) -@pytest.mark.parametrize("max_rank", [1, 8, 16, 32, 64, 128]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16,torch.float32]) -@torch.inference_mode() -def test_shrink_kernel(batchs, hidden_size, lora_nums, max_rank, dtype): - SEED = [0xABCDABCD987] - torch.manual_seed(SEED[0]) - if batchs == 0: - batchs += 1 - - seq_len_tensor = torch.randint(1, 1024, (batchs,)).cuda() - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), - dim=0, - ).cuda() - total_tokens = seq_len_tensor.sum() - - inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).cuda() - lora_a_weights = torch.rand( - (lora_nums, max_rank, hidden_size), # col-major - dtype=dtype, - ).cuda() - - lora_indices_tensor = torch.randint(0, lora_nums - 1, (batchs,)).cuda() - output_tensor = torch.zeros( - total_tokens, max_rank, dtype=torch.float32 - ).cuda() - - max_seq_length = seq_len_tensor.max() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - - sgmv_shrink( - inputs_tensor, - lora_a_weights, - output_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - ) - torch.cuda.synchronize() - torch_out_tensor = ref_torch_groupgemm( - inputs_tensor, - 
lora_a_weights, - batchs, - lora_indices_tensor, - seq_len_tensor, - ) - torch_out_tensor = torch_out_tensor.to(torch.float32) - assert torch.allclose(torch_out_tensor, output_tensor, atol=1e-2, rtol=1e-2) - -@pytest.mark.parametrize("batchs", [i for i in range(0, 128, 8)]) -@pytest.mark.parametrize("hidden_size", [128, 256, 512, 1024, 4096, 8192, 3424]) -@pytest.mark.parametrize("lora_nums", [4, 8, 16, 32, 64, 128]) -@pytest.mark.parametrize("max_rank", [1, 8, 16, 32, 64, 128]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16,torch.float32]) -@torch.inference_mode() -def test_expand_kernel(batchs, hidden_size, lora_nums, max_rank, dtype): - SEED = [0xABCDABCD987] - torch.manual_seed(SEED[0]) - if batchs == 0: - batchs += 1 - - seq_len_tensor = torch.randint(1, 1024, (batchs,)).cuda() - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), - dim=0, - ).cuda() - total_tokens = seq_len_tensor.sum() - - inputs_tensor = torch.rand((total_tokens, max_rank), dtype=dtype).cuda() - lora_b_weights = torch.rand( - (lora_nums,hidden_size, max_rank), # col-major - dtype=dtype, - ).cuda() - - lora_indices_tensor = torch.randint(0, lora_nums - 1, (batchs,)).cuda() - output_tensor = torch.zeros( - total_tokens, hidden_size, dtype=dtype - ).cuda() - - max_seq_length = seq_len_tensor.max() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - - sgmv_expand( - inputs_tensor, - lora_b_weights, - output_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - ) - torch.cuda.synchronize() - torch_out_tensor = ref_torch_groupgemm( - inputs_tensor, - lora_b_weights, - batchs, - lora_indices_tensor, - seq_len_tensor, - ) - assert torch.allclose(torch_out_tensor, output_tensor, atol=1e-2, rtol=1e-2) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 8957b6168304..1e6cb83f719d 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -1,9 +1,11 @@ # Based on code from https://github.com/punica-ai/punica from typing import Optional + import torch -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_shrink import sgmv_shrink + +# from vllm.lora.ops.sgmv_expand import sgmv_expand +# from vllm.lora.ops.sgmv_shrink import sgmv_shrink def _raise_import_error(e): From 2fbb2ca49f97caaeaa3fbeb819d0df3fc43ba749 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 28 May 2024 22:18:08 +0800 Subject: [PATCH 04/71] back up --- tests/lora/test_triton_sgmv.py | 61 +++--- vllm/lora/layers.py | 306 ++++++++++++++++++++++++++--- vllm/lora/models.py | 21 +- vllm/lora/ops/sgmv_expand.py | 46 ++++- vllm/lora/ops/sgmv_expand_slice.py | 207 +++++++++++++++++++ vllm/lora/ops/sgmv_shrink.py | 25 ++- vllm/lora/punica.py | 193 +++++++++++++++--- vllm/worker/model_runner.py | 37 +++- 8 files changed, 797 insertions(+), 99 deletions(-) create mode 100644 vllm/lora/ops/sgmv_expand_slice.py diff --git a/tests/lora/test_triton_sgmv.py b/tests/lora/test_triton_sgmv.py index 5cbd40f210fb..d0903f76cd37 100644 --- a/tests/lora/test_triton_sgmv.py +++ b/tests/lora/test_triton_sgmv.py @@ -7,7 +7,7 @@ from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_shrink import sgmv_shrink -#The current punica kernel supports dimension and adds a dimension of 3424. +# The current punica kernel supports dimension and adds a dimension of 3424. 
HIDDEN_SIZES = [ 128, 256, @@ -93,6 +93,7 @@ def _torch_groupgemm( seq_len_tensor, batchs, scaling, + op_type, ) -> torch.Tensor: out_list = [] current_offset = 0 @@ -103,7 +104,11 @@ def _torch_groupgemm( result = torch.nn.functional.linear(input_weight, lora_weight) result *= scaling out_list.append(result) - out_tensor.copy_(torch.cat(out_list, dim=0)) + cat_result = torch.cat(out_list, dim=0) + if op_type == "expand": + out_tensor += cat_result + else: + out_tensor.copy_(cat_result) return @@ -122,6 +127,7 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, (lora_nums, max_rank, hidden_size), # col-major dtype=dtype, ).to(device) + # shrink op need atomic_add, so output is initinized by 0 ref_out_tensor = torch.zeros((total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device) @@ -132,6 +138,7 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, device=inputs_tensor.device, ) else: + inputs_tensor = torch.rand( (total_tokens, max_rank), dtype=dtype, @@ -140,16 +147,15 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, (lora_nums, hidden_size, max_rank), # col-major dtype=dtype, ).to(device) - ref_out_tensor = torch.zeros( - (total_tokens, hidden_size), - dtype=dtype, - device=inputs_tensor.device, - ) - our_out_tensor = torch.zeros( + # expand op needs to complete y+=a@lora_b, so output is + # initinized randomly + ref_out_tensor = torch.rand( (total_tokens, hidden_size), dtype=dtype, device=inputs_tensor.device, ) + # Ensure the same input. + our_out_tensor = ref_out_tensor.clone() lora_indices_tensor = torch.randint(0, lora_nums - 1 if lora_nums > 1 else 1, @@ -181,7 +187,7 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, @pytest.mark.parametrize("op_type", OP_TYPES) @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) -def test_triton_sgmv( +def test_sgmv_torch( batchs: int, num_loras: int, rank: int, @@ -228,25 +234,18 @@ def test_triton_sgmv( scaling, ) else: - sgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - ) - _torch_groupgemm( - ref_out_tensor, - inputs_tensor, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batchs, - scaling if op_type == "shrink" else 1.0, - ) + sgmv_expand(inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + add_inputs=True) + _torch_groupgemm(ref_out_tensor, inputs_tensor, lora_weights, + lora_indices_tensor, seq_len_tensor, batchs, + scaling if op_type == "shrink" else 1.0, op_type) if op_type == "shrink": ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) @@ -285,6 +284,7 @@ def test_sgmv_punica_bgmv( indices, ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device) + max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): max_seq_length = max_seq_length[0].item() @@ -312,6 +312,7 @@ def test_sgmv_punica_bgmv( lora_indices_tensor, batchs, max_seq_length, + add_inputs=True, ) lora_weights_4d = lora_weights.unsqueeze(dim=1) _punica_bgmv( @@ -324,3 +325,7 @@ def test_sgmv_punica_bgmv( if op_type == "shrink": ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) + + +# if __name__ == "__main__": +# pytest.main(["test_triton_sgmv.py::test_sgmv_torch"]) diff 
--git a/vllm/lora/layers.py b/vllm/lora/layers.py index 24b74476c3b8..5e4f648f3788 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1,6 +1,6 @@ # pylint: disable=unused-argument import math -from dataclasses import dataclass +from dataclasses import dataclass,field from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import torch @@ -16,7 +16,9 @@ tensor_model_parallel_all_reduce, tensor_model_parallel_gather) from vllm.distributed.utils import divide -from vllm.lora.punica import add_lora, add_lora_slice, bgmv +from vllm.lora.punica import (add_lora, add_lora_triton, add_lora_slice, + add_lora_triton_slice, bgmv) +from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, @@ -88,8 +90,47 @@ def _apply_lora( x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) indices = indices.view(-1) - add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) - return output.view_as(org_output) + buffer = add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, + 1.0) + return buffer, output.view_as(org_output) + + +def _apply_lora_triton( + x: torch.Tensor, + lora_a_stacked: torch.Tensor, + lora_b_stacked: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + seq_length_tensor: torch.Tensor, + lora_index_tensor: torch.Tensor, + batch_mlength_lst: List[int], + output: torch.Tensor, +): + # """Applies lora to each input. + + # This method applies all loras to each input. It uses the + # indices vector to determine which lora yields the + # correct output. An index of -1 means no lora should be + # applied. This method adds the final lora results to the + # output. + + # Input shapes: + # x: (batch_size, hidden_dim) + # lora_a_stacked: (num_loras, lora_rank, hidden_dim) + # lora_b_stacked: (num_loras, output_dim, lora_rank) + # indices: (batch_size) + # output: (batch_size, output_dim) + # """ + org_output = output + x = x.view(-1, x.shape[-1]) + output = output.view(-1, output.shape[-1]) + + batch_size = batch_mlength_lst[0] + max_length = batch_mlength_lst[1] + + buffer = add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, + b_seq_start_tensor, seq_length_tensor, + lora_index_tensor, batch_size, max_length, 0, 1.0) + return buffer, output.view_as(org_output) def _apply_lora_packed_nslice( @@ -133,12 +174,64 @@ def _apply_lora_packed_nslice( return output.view_as(org_output) +def _apply_lora_triton_nslice( + x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], + b_seq_start_tensor: torch.Tensor, + seq_length_tensor: torch.Tensor, + lora_index_tensor: torch.Tensor, + batch_mlength_lst: List[int], + output: torch.Tensor, + output_slices: Tuple[int, ...], +): + # """Applies lora to each input. + + # This method applies all loras to each input. It uses the + # indices vector to determine which lora yields the + # correct output. An index of -1 means no lora should be + # applied. This method adds the final lora results to the + # output. 
+ + # Input shapes: + # x: (batch_size, hidden_dim) + # lora_a_stacked: (num_loras, lora_rank, hidden_dim) + # lora_b_stacked: (num_loras, output_dim, lora_rank) + # indices: (batch_size) + # output: (batch_size, output_dim) + # """ + org_output = output + x = x.view(-1, x.shape[-1]) + output = output.view(-1, output.shape[-1]) + + batch_size = batch_mlength_lst[0] + max_length = batch_mlength_lst[1] + + offset_left = 0 + #TODO fuse these kernel + for slice_idx in range(len(output_slices)): + add_lora_triton_slice(output, x, lora_a_stacked[slice_idx], + lora_b_stacked[slice_idx], b_seq_start_tensor, + seq_length_tensor, lora_index_tensor, batch_size, + max_length, 0, 1.0, offset_left, + output_slices[slice_idx]) + offset_left += output_slices[slice_idx] + + return output.view_as(org_output) + + @dataclass class LoRAMapping: # Per every token in input_ids: index_mapping: Tuple[int, ...] # Per sampled token: prompt_mapping: Tuple[int, ...] + # Per batch lora index + batch_mapping: List[int]=field(default_factory=list) + # Per batch seq length + seq_lens: List[int]=field(default_factory=list) + # prefilling or decoding. + is_prefilling: bool=False def __post_init__(self): self.index_mapping = tuple(self.index_mapping) @@ -193,6 +286,13 @@ def set_mapping( """Sets the mapping indices.""" ... + def set_kernel_mapping(self, seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + lora_index_tensor: torch.Tensor, + batch_mlength_lst: List[int]): + """Sets the kernel mapping""" + ... + @classmethod def can_replace_layer(cls, source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: List, @@ -270,6 +370,11 @@ def create_lora_weights( self.indices_len: List[int] self.embeddings_indices: torch.Tensor + self.seq_length_tensor: torch.Tensor + self.b_seq_start_tensor: torch.Tensor + self.lora_index_tensor: torch.Tensor + self.batch_mlength_list: List[int] + def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 @@ -316,6 +421,18 @@ def set_mapping( self.embeddings_indices = embeddings_indices self.indices_len = indices_len + def set_kernel_mapping( + self, + seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + lora_index_tensor: torch.Tensor, + batch_mlength_lst: List[int], + ): + self.seq_length_tensor = seq_length_tensor + self.b_seq_start_tensor = b_seq_start_tensor + self.lora_index_tensor = lora_index_tensor + self.batch_mlength_list = batch_mlength_lst + def forward(self, x: torch.Tensor) -> torch.Tensor: added_tokens_mask = x > self.base_layer.org_vocab_size - 1 embedding_len = self.indices_len[3] @@ -336,8 +453,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings = full_lora_a_embeddings.view( full_lora_a_embeddings.shape[0] * full_lora_a_embeddings.shape[1], -1) - bgmv(full_output, full_lora_a_embeddings, self.lora_b_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) + batchs, max_length = self.batch_mlength_list[ + 0], self.batch_mlength_list[1] + + sgmv_expand( + full_lora_a_embeddings, + self.lora_b_stacked, + full_output, + self.b_seq_start_tensor[:batchs], + self.seq_length_tensor[:batchs], + self.lora_index_tensor[:batchs], + batchs, + max_length, + True, + ) return full_output.view_as(full_output_org) @classmethod @@ -393,6 +522,10 @@ def create_lora_weights( # lazily initialized. 
self.indices: torch.Tensor self.indices_len: List[int] + self.seq_length_tensor: torch.Tensor + self.b_seq_start_tensor: torch.Tensor + self.lora_index_tensor: torch.Tensor + self.batch_mlength_list: List[int] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -441,16 +574,28 @@ def set_mapping( self.indices = base_indices self.indices_len = indices_len + def set_kernel_mapping( + self, + seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + lora_index_tensor: torch.Tensor, + batch_mlength_lst: List[int], + ): + self.seq_length_tensor = seq_length_tensor + self.b_seq_start_tensor = b_seq_start_tensor + self.lora_index_tensor = lora_index_tensor + self.batch_mlength_list = batch_mlength_lst + def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora( - x, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices[:self.indices_len[0]], - output, - ) + batch_size = self.batch_mlength_list[0] + # maybe we need not restrict range to [:batch_size] + _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, + self.b_seq_start_tensor[:batch_size], + self.seq_length_tensor[:batch_size], + self.lora_index_tensor[:batch_size], + self.batch_mlength_list, output) return output def forward(self, input_): @@ -542,6 +687,11 @@ def create_lora_weights( # Lazily initialized. self.indices: torch.Tensor + self.seq_length_tensor: torch.Tensor + self.b_seq_start_tensor: torch.Tensor + self.lora_index_tensor: torch.Tensor + self.batch_mlength_list: List[int] + def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 self.lora_a_stacked[1][index] = 0 @@ -597,14 +747,32 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_packed_nslice( + # output_temp=output.clone() + # _apply_lora_packed_nslice( + # x, + # self.lora_a_stacked, + # self.lora_b_stacked, + # self.indices[:self.indices_len[0]], + # output, + # (self.output_dim, self.output_dim), + # ) + batchs = self.batch_mlength_list[0] + _apply_lora_triton_nslice( x, self.lora_a_stacked, self.lora_b_stacked, - self.indices[:self.indices_len[0]], + self.b_seq_start_tensor[:batchs], + self.seq_length_tensor[:batchs], + self.lora_index_tensor[:batchs], + self.batch_mlength_list, output, (self.output_dim, self.output_dim), ) + # flag=torch.allclose(output,output_temp,1e-2,1e-2) + # if flag: + # print("pass") + # else: + # print() return output @classmethod @@ -774,6 +942,11 @@ def create_lora_weights( # lazily initialized. 
self.indices_len: List[int] + self.seq_length_tensor: torch.Tensor + self.b_seq_start_tensor: torch.Tensor + self.lora_index_tensor: torch.Tensor + self.batch_mlength_list: List[int] + def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 self.lora_b_stacked[0][index] = 0 @@ -851,14 +1024,27 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_packed_nslice( + # _apply_lora_packed_nslice( + # x, + # self.lora_a_stacked, + # self.lora_b_stacked, + # self.indices[:self.indices_len[0]], + # output, + # self.output_slices, + # ) + batchs = self.batch_mlength_list[0] + _apply_lora_triton_nslice( x, self.lora_a_stacked, self.lora_b_stacked, - self.indices[:self.indices_len[0]], + self.b_seq_start_tensor[:batchs], + self.seq_length_tensor[:batchs], + self.lora_index_tensor[:batchs], + self.batch_mlength_list, output, self.output_slices, ) + return output @classmethod @@ -915,6 +1101,11 @@ def create_lora_weights( self.indices: torch.Tensor self.indices_len: List[int] + self.seq_length_tensor: torch.Tensor + self.b_seq_start_tensor: torch.Tensor + self.lora_index_tensor: torch.Tensor + self.batch_mlength_list: List[int] + def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 @@ -962,17 +1153,56 @@ def set_mapping( self.indices = base_indices self.indices_len = indices_len + def set_kernel_mapping( + self, + seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + lora_index_tensor: torch.Tensor, + batch_mlength_lst: List[int], + ): + self.seq_length_tensor = seq_length_tensor + self.b_seq_start_tensor = b_seq_start_tensor + self.lora_index_tensor = lora_index_tensor + self.batch_mlength_list = batch_mlength_lst + def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) - _apply_lora( - x, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices[:self.indices_len[0]], - output, - ) + batch_size = self.batch_mlength_list[0] + # maybe we need not restrict range to [:batch_size] + _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, + self.b_seq_start_tensor[:batch_size], + self.seq_length_tensor[:batch_size], + self.lora_index_tensor[:batch_size], + self.batch_mlength_list, output) return output + # def apply(self, x: torch.Tensor) -> torch.Tensor: + # output = self.base_layer.quant_method.apply(self.base_layer, x) + # temp_output = output.clone() + # output2 = output.clone() + # mid_buffer,_=_apply_lora( + # x, + # self.lora_a_stacked, + # self.lora_b_stacked, + # self.indices[:self.indices_len[0]], + # output, + # ) + # batch_size = self.batch_mlength_list[0] + # # print(f"self.indices[:self.indices_len[0]]={ self.indices[:self.indices_len[0]]},\ + # # lora_index_tensor={self.lora_index_tensor[:batch_size]},batch={self.batch_mlength_list[0]}") + # # # + # mid2_buffer,_=_apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, + # self.b_seq_start_tensor[:batch_size], + # self.seq_length_tensor[:batch_size], + # self.lora_index_tensor[:batch_size], + # self.batch_mlength_list, output) + # flag = torch.allclose(mid_buffer, mid2_buffer, 3e-2, 2e-2) + # # if not flag: + # # print("error") + # # else: + # # print("pass") + # return temp_output + def forward(self, input_): """Forward of RowParallelLinear @@ -1103,6 +1333,11 @@ def create_lora_weights( self.indices_len: List[int] self.indices_padded: torch.Tensor + 
self.seq_length_tensor: torch.Tensor + self.b_seq_start_tensor: torch.Tensor + self.lora_index_tensor: torch.Tensor + self.batch_mlength_list: List[int] + def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 @@ -1140,6 +1375,18 @@ def set_mapping( self.indices_padded = sampler_indices_padded self.indices_len = indices_len + def set_kernel_mapping( + self, + seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + lora_index_tensor: torch.Tensor, + batch_mlength_lst: List[int], + ): + self.seq_length_tensor = seq_length_tensor + self.b_seq_start_tensor = b_seq_start_tensor + self.lora_index_tensor = lora_index_tensor + self.batch_mlength_list = batch_mlength_lst + def _get_logits( self, hidden_states: torch.Tensor, @@ -1186,6 +1433,17 @@ def _get_logits( logits, ) + # batch_size=self.batch_mlength_list[0] + # _apply_lora_triton(hidden_states, self.lora_a_stacked, self.lora_b_stacked, + # self.b_seq_start_tensor[:batch_size], + # self.seq_length_tensor[:batch_size], + # self.indices[:self.indices_len[1]], + # self.batch_mlength_list, logits_temp) + # flag=torch.allclose(logits_temp,logits,rtol=1e-2,atol=1e-2) + # if flag: + # print("pass") + # else: + # print("error") # Remove paddings in vocab (if any). logits = logits[:, :self.base_layer.vocab_size] diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 3e82856866d8..392b8b4a6c51 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -81,7 +81,7 @@ def convert_mapping( embeddings_indices, long_lora_indices). If long_lora doesn't exist, it only contains first 4 entries. """ - index_mapping_indices: List[int] = list(mapping.index_mapping).copy() + index_mapping_indices: List[int] = list(mapping.batch_mapping).copy() embedding_indices = index_mapping_indices.copy() lora_indices = index_mapping_indices.copy() long_lora_offsets: Optional[torch.Tensor] = None @@ -427,6 +427,19 @@ def __init__( # Dict instead of a Set for compatibility with LRUCache. 
self._active_loras: Dict[int, None] = {} self._last_mapping: Optional[LoRAMapping] = None + + # triton kernel mapping + + self.batch_mlength_lst = [-1] * 2 + self.seq_length_tensor = torch.empty(self.max_num_batched_tokens, + dtype=torch.long, + device="cuda") + self.b_seq_start_tensor = torch.empty(self.max_num_batched_tokens, + dtype=torch.long, + device="cuda") + self.lora_index_tensor = torch.empty(self.max_num_batched_tokens, + dtype=torch.long, + device="cuda") self._create_lora_modules() self.model.lora_manager = self @@ -548,6 +561,8 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: # Maintain the reference self.indices_len[:] = indices_len + + def set_lora_mapping(self, lora_mapping: LoRAMapping) -> None: if self._last_mapping != lora_mapping: self._set_lora_mapping(lora_mapping) @@ -600,6 +615,10 @@ def _create_lora_modules(self): self.sampler_indices_padded, self.embeddings_indices, self.long_lora_indices, self.indices_len) + new_module.set_kernel_mapping(self.seq_length_tensor, + self.b_seq_start_tensor, + self.lora_index_tensor, + self.batch_mlength_lst) def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): assert isinstance(module, BaseLayerWithLoRA) diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 354778926250..c68c551db89e 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -24,10 +24,11 @@ def _sgmv_expand_kernel( BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, EVEN_K: tl.constexpr, + ADD_INPUTS: tl.constexpr, + CAST_TYPE: tl.constexpr, ): """ The sgmv's expand triton kernel is based on GroupGEMM. - The GEMM of Multi-LoRA can be considered as GroupGEMM. """ pid = tl.program_id(axis=0) cur_batch = tl.program_id(axis=1) @@ -63,13 +64,16 @@ def _sgmv_expand_kernel( tiled_b = tl.load(b_ptr, mask=offset_k[:, None] < K - k * BLOCK_K, other=0) + if CAST_TYPE: + tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) + #TODO Can I use D=A@B+C ? accumulator += tl.dot( tiled_a, tiled_b, ) a_ptr += BLOCK_K * xk_stride b_ptr += BLOCK_K * lora_n_stride - tiled_c = accumulator.to(input_ptr.dtype.element_ty) + tiled_c = accumulator.to(lora_ptr.dtype.element_ty) offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N c_ptr = (out_ptr + offset_cm[:, None] * cm_stride + @@ -77,6 +81,9 @@ def _sgmv_expand_kernel( M = tl.load(seq_lens + cur_batch) c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] < N) + if ADD_INPUTS: + tiled_out = tl.load(c_ptr, mask=c_mask) + tiled_c += tiled_out tl.store(c_ptr, tiled_c, mask=c_mask) @@ -90,9 +97,11 @@ def sgmv_expand( lora_indices_tensor: torch.Tensor, batchs: int, max_seq_length: int, + add_inputs: bool = False, ): """_summary_ + Args: inputs (torch.Tensor): input tensor lora_b_weights (torch.Tensor): lora'a weight @@ -108,27 +117,48 @@ def sgmv_expand( batchs (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch + add_inputs (bool, optional): _description_. Defaults to False. + cast_type (bool, optional): _description_. Defaults to False. 
""" - assert inputs.dtype == lora_b_weights.dtype + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + torch.float32, + ] assert inputs.size(1) == lora_b_weights.size(-1) assert b_seq_start_loc.size(0) == batchs assert lora_indices_tensor.size(0) == batchs assert inputs.is_contiguous() - assert lora_b_weights.is_contiguous() assert output_tensor.is_contiguous() + + if lora_b_weights.ndim == 4: # shape:(lora_num,1,size,rank) + assert lora_b_weights.size(1) == 1 + lora_b_weights = lora_b_weights.squeeze(dim=1) + else: + assert lora_b_weights.ndim == 3 # shape:(lora_num,size,rank) + + assert lora_b_weights.is_contiguous() + # TODO tuning this config - _, N, K = lora_b_weights.shape # K= rank,N=hidden_size + + N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size BLOCK_M = 32 BLOCK_N = 32 BLOCK_K = 16 EVEN_K = K % BLOCK_K == 0 - + ADD_INPUTS = add_inputs + CAST_TYPE = False + if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + ]: + CAST_TYPE = True grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), batchs, ] - _sgmv_expand_kernel[grid]( inputs, lora_b_weights, @@ -149,5 +179,7 @@ def sgmv_expand( BLOCK_N, BLOCK_K, EVEN_K, + ADD_INPUTS, + CAST_TYPE, ) return diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py new file mode 100644 index 000000000000..a8d93aa196a2 --- /dev/null +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -0,0 +1,207 @@ +import torch +import triton +import triton.language as tl + + +@triton.jit +def _sgmv_expand_slice_kernel( + input_ptr, + lora_ptr, + out_ptr, + N, + K, + b_seq_start_loc, + seq_lens, + lora_indices, + xm_stride, + xk_stride, # 1 + l0_stride, # hidden_size*max_rank + lora_k_stride, + lora_n_stride, + cm_stride, + cn_stride, + slice_offset, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, + ADD_INPUTS: tl.constexpr, + CAST_TYPE: tl.constexpr, +): + """ + + Similar to the 'sgmv_expand' operator, but with an added parameter + 'slice_offset'. The reason for not reusing the 'sgmv_expand' operator + might be that in the future, we could implement a fusion operator to + achieve the current functionality instead of having to call it multiple + times. 
+ """ + pid = tl.program_id(axis=0) + cur_batch = tl.program_id(axis=1) + cta_n_num = tl.cdiv(N, BLOCK_N) + pid_m = pid // cta_n_num + pid_n = pid % cta_n_num + M = tl.load(seq_lens + cur_batch) + if pid_m * BLOCK_M > M: + return + lora_index = tl.load(lora_indices + cur_batch) + if lora_index == -1: + return + cur_seq_start = tl.load(b_seq_start_loc + cur_batch) + offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + offset_k = tl.arange(0, BLOCK_K) + ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + + a_ptr = ( + input_ptr + + cur_seq_start * xm_stride + + ram[:, None] * xm_stride + + offset_k[None, :] * xk_stride, + ) + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + offset_k[:, None] * lora_n_stride + + rbn[None, :] * lora_k_stride + ) + accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for k in range(tl.cdiv(K, BLOCK_K)): + if EVEN_K: + tiled_a = tl.load(a_ptr) + tiled_b = tl.load(b_ptr) + else: + tiled_a = tl.load( + a_ptr, mask=offset_k[None, :] < K - k * BLOCK_K, other=0 + ) + tiled_b = tl.load( + b_ptr, mask=offset_k[:, None] < K - k * BLOCK_K, other=0 + ) + if CAST_TYPE: + tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) + # TODO Can I use D=A@B+C ? + accumulator += tl.dot( + tiled_a, + tiled_b, + ) + a_ptr += BLOCK_K * xk_stride + b_ptr += BLOCK_K * lora_n_stride + tiled_c = accumulator.to(lora_ptr.dtype.element_ty) + offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N+slice_offset + c_ptr = ( + out_ptr + + offset_cm[:, None] * cm_stride + + offset_cn[None, :] * cn_stride + ) + M = tl.load(seq_lens + cur_batch) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & ( + offset_cn[None, :] < (slice_offset+N) + ) + if ADD_INPUTS: + tiled_out = tl.load(c_ptr, mask=c_mask) + tiled_c += tiled_out + tl.store(c_ptr, tiled_c, mask=c_mask) + + +@torch.inference_mode() +def sgmv_expand_slice( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batchs: int, + max_seq_length: int, + slice_offset: int, + slice_size: int, + add_inputs: bool = False, +): + """_summary_ + + Args: + inputs (torch.Tensor): input tensor + lora_b_weights (torch.Tensor): lora'a weight + output_tensor (torch.Tensor): output tensor + b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative + sequence lengths of the sequences in the batch, used to index + into sequence. E.g.,if the sequence length is [4, 6], it is + [0, 4, 10]. + seq_len_tensor (torch.Tensor): (batch_size,). record the sequence + length of the sequences in the batch + lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batchs (int): batch size + max_seq_length (int): The max sequence lengths of the sequences + in the batch + slice_offst (int): output_tensor's offst + slice_size (int): current output_tensor's size + add_inputs (bool, optional): _description_. Defaults to False. 
+ """ + + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + torch.float32, + ] + assert inputs.size(1) == lora_b_weights.size(-1) + assert b_seq_start_loc.size(0) == batchs + assert lora_indices_tensor.size(0) == batchs + assert slice_size==lora_b_weights.size(-2) + assert inputs.is_contiguous() + assert output_tensor.is_contiguous() + + if lora_b_weights.ndim == 4: # shape:(lora_num,1,size,rank) + assert lora_b_weights.size(1) == 1 + lora_b_weights = lora_b_weights.squeeze(dim=1) + else: + assert lora_b_weights.ndim == 3 # shape:(lora_num,size,rank) + + assert lora_b_weights.is_contiguous() + + # TODO tuning this config + N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size + + BLOCK_M = 32 + BLOCK_N = 32 + BLOCK_K = 16 + EVEN_K = K % BLOCK_K == 0 + ADD_INPUTS = add_inputs + CAST_TYPE = False + if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + ]: + CAST_TYPE = True + grid = [ + triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), + batchs, + ] + _sgmv_expand_slice_kernel[grid]( + inputs, + lora_b_weights, + output_tensor, + N, + K, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + inputs.stride(0), + inputs.stride(1), + lora_b_weights.stride(0), + lora_b_weights.stride(1), + lora_b_weights.stride(2), + output_tensor.stride(0), + output_tensor.stride(1), + slice_offset, + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + ADD_INPUTS, + CAST_TYPE, + ) + return diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index d3858d91791e..1b7cf0f3caa6 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -28,8 +28,8 @@ def _sgmv_shrink_kernel( SPLIT_K: tl.constexpr, ): """ - The sgmv's shrink triton kernel is based on GroupGEMM+SPLIT-K. - The GEMM of Multi-LoRA can be considered as GroupGEMM. Additionally, + The sgmv's shrink triton kernel is based on GroupGEMM+SPLIT-K. + The GEMM of Multi-LoRA can be considered as GroupGEMM. 
Additionally, introducing SPLIT-K can improve performance """ pid = tl.program_id(axis=0) @@ -43,8 +43,6 @@ def _sgmv_shrink_kernel( if pid_m * BLOCK_M > M: return lora_index = tl.load(lora_indices + cur_batch) - if lora_index == -1: - return cur_seq_start = tl.load(b_seq_start_loc + cur_batch) offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N @@ -57,6 +55,7 @@ def _sgmv_shrink_kernel( offset_k[None, :] * xk_stride) b_ptr = (lora_ptr + l0_stride * lora_index + rbn[None, :] * lora_k_stride + offset_k[:, None] * lora_n_stride) + accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): if EVEN_K: @@ -67,13 +66,14 @@ def _sgmv_shrink_kernel( a = tl.load(a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0) b = tl.load(b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0) accumulator += tl.dot(a, b) + a_ptr += BLOCK_K * SPLIT_K * xk_stride b_ptr += BLOCK_K * SPLIT_K * lora_n_stride offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N c_ptr = (out_ptr + offset_cm[:, None] * cm_stride + offset_cn[None, :] * cn_stride) - c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] < N) accumulator *= scaling @@ -121,16 +121,21 @@ def sgmv_shrink( assert b_seq_start_loc.size(0) == batchs assert lora_indices_tensor.size(0) == batchs assert inputs.is_contiguous() + + if lora_a_weights.ndim == 4: # shape:(lora_num,1,rank, size) + assert lora_a_weights.size(1) == 1 + lora_a_weights = lora_a_weights.squeeze(dim=1) + else: + assert lora_a_weights.ndim == 3 # shape:(lora_num,rank, size) assert lora_a_weights.is_contiguous() assert output_tensor.is_contiguous() # TODO tuning this config - _, N, K = lora_a_weights.shape # K=hidden_size,N=rank + N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank BLOCK_M = 32 - BLOCK_N = 32 + BLOCK_N = 16 BLOCK_K = 32 - SPLIT_K = 8 - EVEN_K = K % (SPLIT_K * BLOCK_K) == 0 - + SPLIT_K = 1 + EVEN_K = False grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), SPLIT_K, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 1e6cb83f719d..fe7319f93b96 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -4,8 +4,9 @@ import torch -# from vllm.lora.ops.sgmv_expand import sgmv_expand -# from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice def _raise_import_error(e): @@ -52,10 +53,16 @@ def bgmv( punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) -def dispatch_bgmv_low_level(y: torch.Tensor, x: torch.Tensor, - w_t_all: torch.Tensor, indicies: torch.LongTensor, - layer_idx: int, scale: float, y_offset: int, - y_slice_size: int): +def dispatch_bgmv_low_level( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, +): """ Same as `bgmv` but you can operate on slices of y. Pass whole y, define y_offset and y_slice_size. 
@@ -95,15 +102,17 @@ def dispatch_bgmv_low_level(y: torch.Tensor, x: torch.Tensor, ) -def add_lora(y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - *, - buffer: Optional[torch.Tensor] = None): +def add_lora( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + *, + buffer: Optional[torch.Tensor] = None, +): """ Semantics: y[i] += ( @@ -141,19 +150,70 @@ def add_lora(y: torch.Tensor, punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, scale) + return buffer -def add_lora_slice(y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, - *, - buffer: Optional[torch.Tensor] = None): +def add_lora_triton( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + seq_length_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batch_size: int, + max_length: int, + layer_idx: int, + scale: float, + *, + buffer: Optional[torch.Tensor] = None, +): + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default to avoid + # numerical inaccuracies that would otherwise happen + # due to downcasting. + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + sgmv_shrink( + x, + wa_t_all, + buffer, + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + scale, + ) + sgmv_expand( + buffer, + wb_t_all, + y, + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + add_inputs=True, + ) + return buffer + + +def add_lora_slice( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, + *, + buffer: Optional[torch.Tensor] = None, +): """ Same as `add_lora` but you can operate on slices of y. Pass whole y, define y_offset and y_slice_size. @@ -214,3 +274,84 @@ def add_lora_slice(y: torch.Tensor, y_slice_size, y_offset, ) + + +def add_lora_triton_slice( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + seq_length_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batch_size: int, + max_length: int, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, + *, + buffer: Optional[torch.Tensor] = None, +): + """ + Same as `add_lora` but you can operate on slices of y. + Pass whole y, define y_offset and y_slice_size. + + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed + LoRA A matrices. + wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed + LoRA B matrices. + indicies: Shape: `[B]`. Indices of the LoRA weights. + layer_idx: Layer index of LoRA weights. + scale: Scaling factor. + y_offset: Offset to apply to the starting column of y. 
+ y_slice_size: Size of the y column slice. + # """ + # try: + # import vllm._punica_C as punica_kernels + # except ImportError as e: + # _raise_import_error(e) + + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default to avoid + # numerical inaccuracies that would otherwise happen + # due to downcasting. + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + sgmv_shrink( + x, + wa_t_all, + buffer, + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + scale, + ) + sgmv_expand_slice( + buffer, + wb_t_all, + y, + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + y_offset, + y_slice_size, + add_inputs=True, + ) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 87d5f5c1b9d6..d6713b59944e 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -233,6 +233,7 @@ def _prepare_model_input( input_positions: List[int] = [] slot_mapping: List[int] = [] lora_index_mapping: List[int] = [] + batch_lora_index_mapping: List[int] = [] lora_prompt_mapping: List[int] = [] lora_requests: Set[LoRARequest] = set() @@ -386,6 +387,7 @@ def _prepare_model_input( lora_requests.add(seq_group_metadata.lora_request) lora_index_mapping += [lora_id] * (seq_len - context_len) + batch_lora_index_mapping += [lora_id if lora_id > 0 else -1] lora_prompt_mapping.extend( [lora_id] * (seq_len - @@ -586,9 +588,9 @@ def _prepare_model_input( if self.lora_config: lora_mapping = LoRAMapping( - lora_index_mapping, - lora_prompt_mapping, - ) + lora_index_mapping, lora_prompt_mapping, + batch_lora_index_mapping, query_lens, + bool(attn_metadata.prefill_metadata)) else: lora_mapping = None @@ -788,6 +790,32 @@ def profile_run(self) -> None: torch.cuda.synchronize() return + # def compose_lora_kernel_meta( + # self, + # attn_metadata: AttentionMetadata, + # ) -> LoRAKernelMeta: + # if attn_metadata.prefill_metadata: + # max_seq_len = attn_metadata.max_query_len + # seq_start_loc = attn_metadata.query_start_loc + # seq_lens_tensor = attn_metadata.seq_lens_tensor + # batch_size = attn_metadata.num_prefills + # else: + # max_seq_len = attn_metadata.max_query_len + # seq_start_loc = attn_metadata.query_start_loc + # batch_size = attn_metadata.decode_metadata.num_decode_tokens + # seq_lens_tensor = torch.ones((batch_size), + # dtype=torch.long, + # device=self.device) + + # if batch_size == 0: + # print("sssss") + # # lora_index_lst = lora_mapping.batch_mapping + # # lora_index_tensor = torch.tensor(lora_index_lst, + # # dtype=torch.long, + # # device=self.device) + # return LoRAKernelMeta(batch_size, max_seq_len, seq_lens_tensor, + # seq_start_loc) + def remove_all_loras(self): if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") @@ -881,6 +909,9 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: lora_mapping = LoRAMapping( [0] * batch_size, [0] * batch_size, + [0] * batch_size, + [1] * batch_size, + False ) self.set_active_loras(set(), lora_mapping) From fad4b033cf7a49ce5a5902741feec96742044e87 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 29 May 2024 00:33:54 +0800 Subject: [PATCH 05/71] start replacing bgmv --- vllm/lora/layers.py | 18 +++++------ vllm/lora/models.py | 34 ++++++++++++++++++-- vllm/lora/ops/sgmv_expand_slice.py | 50 ++++++++++++------------------ vllm/lora/ops/sgmv_shrink.py | 2 +- vllm/lora/punica.py | 2 -- vllm/worker/model_runner.py | 18 +++++------ 6 files changed, 66 insertions(+), 58 
deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 5e4f648f3788..68127fd5fe61 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1,6 +1,6 @@ # pylint: disable=unused-argument import math -from dataclasses import dataclass,field +from dataclasses import dataclass, field from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import torch @@ -127,10 +127,10 @@ def _apply_lora_triton( batch_size = batch_mlength_lst[0] max_length = batch_mlength_lst[1] - buffer = add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, + add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, b_seq_start_tensor, seq_length_tensor, lora_index_tensor, batch_size, max_length, 0, 1.0) - return buffer, output.view_as(org_output) + return output.view_as(org_output) def _apply_lora_packed_nslice( @@ -227,11 +227,11 @@ class LoRAMapping: # Per sampled token: prompt_mapping: Tuple[int, ...] # Per batch lora index - batch_mapping: List[int]=field(default_factory=list) + batch_mapping: List[int] = field(default_factory=list) # Per batch seq length - seq_lens: List[int]=field(default_factory=list) + seq_lens: List[int] = field(default_factory=list) # prefilling or decoding. - is_prefilling: bool=False + is_prefilling: bool = False def __post_init__(self): self.index_mapping = tuple(self.index_mapping) @@ -1188,10 +1188,8 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # output, # ) # batch_size = self.batch_mlength_list[0] - # # print(f"self.indices[:self.indices_len[0]]={ self.indices[:self.indices_len[0]]},\ - # # lora_index_tensor={self.lora_index_tensor[:batch_size]},batch={self.batch_mlength_list[0]}") - # # # - # mid2_buffer,_=_apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, + # mid2_buffer,_=_apply_lora_triton(x, self.lora_a_stacked, + # self.lora_b_stacked, # self.b_seq_start_tensor[:batch_size], # self.seq_length_tensor[:batch_size], # self.lora_index_tensor[:batch_size], diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 392b8b4a6c51..96e2e51bd93e 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -81,7 +81,7 @@ def convert_mapping( embeddings_indices, long_lora_indices). If long_lora doesn't exist, it only contains first 4 entries. 
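        For example (illustrative values): with lora_index_to_id = [8, 3],
        a request using LoRA id 3 resolves to slot 1, id 8 to slot 0, and
        id 0 (no LoRA) to -1 in the base/sampler index tensors.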
""" - index_mapping_indices: List[int] = list(mapping.batch_mapping).copy() + index_mapping_indices: List[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() lora_indices = index_mapping_indices.copy() long_lora_offsets: Optional[torch.Tensor] = None @@ -434,7 +434,7 @@ def __init__( self.seq_length_tensor = torch.empty(self.max_num_batched_tokens, dtype=torch.long, device="cuda") - self.b_seq_start_tensor = torch.empty(self.max_num_batched_tokens, + self.b_seq_start_tensor = torch.zeros(self.max_num_batched_tokens, dtype=torch.long, device="cuda") self.lora_index_tensor = torch.empty(self.max_num_batched_tokens, @@ -561,7 +561,35 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: # Maintain the reference self.indices_len[:] = indices_len - + if mapping.seq_lens: + batchs = len(mapping.seq_lens) + seq_length_tensor = torch.tensor(mapping.seq_lens, + dtype=torch.long, + device="cuda") + self.seq_length_tensor[:batchs].copy_(seq_length_tensor) + # b_seq_start_tensor = torch.zeros(seq_length_tensor.shape[0] + 1, + # dtype=torch.long, + # device="cuda") + # torch.cumsum(seq_length_tensor, + # dim=0, + # dtype=seq_length_tensor.dtype, + # out=b_seq_start_tensor[1:]) + torch.cumsum(seq_length_tensor, + dim=0, + dtype=seq_length_tensor.dtype, + out=self.b_seq_start_tensor[1:]) + # self.b_seq_start_tensor[:batchs].copy_(b_seq_start_tensor) + lora_id_lst = [] + for lora_index in mapping.batch_mapping: + lora_id_lst.append( + self.lora_index_to_id.index(lora_index + ) if lora_index > 0 else -1) + lora_id_tensor = torch.tensor(lora_id_lst, + dtype=torch.long, + device="cuda") + self.lora_index_tensor[:lora_id_tensor.size(0)].copy_( + lora_id_tensor) + self.batch_mlength_lst[:] = [batchs, max(mapping.seq_lens)] def set_lora_mapping(self, lora_mapping: LoRAMapping) -> None: if self._last_mapping != lora_mapping: diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index a8d93aa196a2..b0bf8015431e 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -54,30 +54,22 @@ def _sgmv_expand_slice_kernel( ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - a_ptr = ( - input_ptr - + cur_seq_start * xm_stride - + ram[:, None] * xm_stride - + offset_k[None, :] * xk_stride, - ) - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + offset_k[:, None] * lora_n_stride - + rbn[None, :] * lora_k_stride - ) + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + + offset_k[None, :] * xk_stride, ) + b_ptr = (lora_ptr + l0_stride * lora_index + + offset_k[:, None] * lora_n_stride + rbn[None, :] * lora_k_stride) accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) for k in range(tl.cdiv(K, BLOCK_K)): if EVEN_K: tiled_a = tl.load(a_ptr) tiled_b = tl.load(b_ptr) else: - tiled_a = tl.load( - a_ptr, mask=offset_k[None, :] < K - k * BLOCK_K, other=0 - ) - tiled_b = tl.load( - b_ptr, mask=offset_k[:, None] < K - k * BLOCK_K, other=0 - ) + tiled_a = tl.load(a_ptr, + mask=offset_k[None, :] < K - k * BLOCK_K, + other=0) + tiled_b = tl.load(b_ptr, + mask=offset_k[:, None] < K - k * BLOCK_K, + other=0) if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # TODO Can I use D=A@B+C ? 
@@ -89,16 +81,12 @@ def _sgmv_expand_slice_kernel( b_ptr += BLOCK_K * lora_n_stride tiled_c = accumulator.to(lora_ptr.dtype.element_ty) offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M - offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N+slice_offset - c_ptr = ( - out_ptr - + offset_cm[:, None] * cm_stride - + offset_cn[None, :] * cn_stride - ) + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + slice_offset + c_ptr = (out_ptr + offset_cm[:, None] * cm_stride + + offset_cn[None, :] * cn_stride) M = tl.load(seq_lens + cur_batch) - c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & ( - offset_cn[None, :] < (slice_offset+N) - ) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] < + (slice_offset + N)) if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) tiled_c += tiled_out @@ -150,7 +138,7 @@ def sgmv_expand_slice( assert inputs.size(1) == lora_b_weights.size(-1) assert b_seq_start_loc.size(0) == batchs assert lora_indices_tensor.size(0) == batchs - assert slice_size==lora_b_weights.size(-2) + assert slice_size == lora_b_weights.size(-2) assert inputs.is_contiguous() assert output_tensor.is_contiguous() @@ -158,7 +146,7 @@ def sgmv_expand_slice( assert lora_b_weights.size(1) == 1 lora_b_weights = lora_b_weights.squeeze(dim=1) else: - assert lora_b_weights.ndim == 3 # shape:(lora_num,size,rank) + assert lora_b_weights.ndim == 3 # shape:(lora_num,size,rank) assert lora_b_weights.is_contiguous() @@ -172,8 +160,8 @@ def sgmv_expand_slice( ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 1b7cf0f3caa6..b8d0d8a23c8c 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -134,7 +134,7 @@ def sgmv_shrink( BLOCK_M = 32 BLOCK_N = 16 BLOCK_K = 32 - SPLIT_K = 1 + SPLIT_K = 16 EVEN_K = False grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index fe7319f93b96..4f4fccca8051 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -150,7 +150,6 @@ def add_lora( punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, scale) - return buffer def add_lora_triton( @@ -198,7 +197,6 @@ def add_lora_triton( max_length, add_inputs=True, ) - return buffer def add_lora_slice( diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index d6713b59944e..aaa8a66c40ab 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -587,10 +587,9 @@ def _prepare_model_input( ) if self.lora_config: - lora_mapping = LoRAMapping( - lora_index_mapping, lora_prompt_mapping, - batch_lora_index_mapping, query_lens, - bool(attn_metadata.prefill_metadata)) + lora_mapping = LoRAMapping(lora_index_mapping, lora_prompt_mapping, + batch_lora_index_mapping, query_lens, + bool(attn_metadata.prefill_metadata)) else: lora_mapping = None @@ -906,13 +905,10 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: ) if self.lora_config: - lora_mapping = LoRAMapping( - [0] * batch_size, - [0] * batch_size, - [0] * batch_size, - [1] * batch_size, - False - ) + lora_mapping = LoRAMapping([0] * batch_size, + [0] * batch_size, + [0] * batch_size, + [1] * batch_size, False) self.set_active_loras(set(), lora_mapping) graph_runner = 
CUDAGraphRunner(self.model) From 40d449abb6992662d237da51773a50376688c3a9 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 29 May 2024 17:23:36 +0800 Subject: [PATCH 06/71] backup --- tests/lora/test_triton_sgmv.py | 69 ++++++++- vllm/lora/layers.py | 220 +++++++++++------------------ vllm/lora/models.py | 95 +++++++------ vllm/lora/ops/sgmv_expand.py | 7 + vllm/lora/ops/sgmv_expand_slice.py | 7 + vllm/lora/ops/sgmv_shrink.py | 7 + 6 files changed, 215 insertions(+), 190 deletions(-) diff --git a/tests/lora/test_triton_sgmv.py b/tests/lora/test_triton_sgmv.py index d0903f76cd37..db3739f35d24 100644 --- a/tests/lora/test_triton_sgmv.py +++ b/tests/lora/test_triton_sgmv.py @@ -6,6 +6,7 @@ import vllm.lora.punica as punica from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice # The current punica kernel supports dimension and adds a dimension of 3424. HIDDEN_SIZES = [ @@ -327,5 +328,69 @@ def test_sgmv_punica_bgmv( assert_close(our_out_tensor, ref_out_tensor) -# if __name__ == "__main__": -# pytest.main(["test_triton_sgmv.py::test_sgmv_torch"]) +@pytest.mark.skip("TODO") +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sgmv_expand_nslice( + hidden_size, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + # avoid `No suitable kernel. h_in=xx h_out=xxxx ` error + if dtype == torch.float32 or hidden_size == 3424: + return + torch.manual_seed(seed) + batchs = 4 # Arbitrary values for testing + rank = 16 + seq_len = 333 # Arbitrary values for testing + num_loras = 8 # Arbitrary values for testing + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) + + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + + sgmv_expand_slice( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + 1024, + add_inputs=True, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + _punica_bgmv( + ref_out_tensor, + inputs_tensor, + lora_weights_4d, + indices, + scaling if op_type == "shrink" else 1.0, + ) + + assert_close(our_out_tensor, ref_out_tensor) + + +if __name__ == "__main__": + pytest.main(["test_triton_sgmv.py::test_sgmv_expand_nslice"]) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 68127fd5fe61..a3a40ad0bd24 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -102,7 +102,7 @@ def _apply_lora_triton( b_seq_start_tensor: torch.Tensor, seq_length_tensor: torch.Tensor, lora_index_tensor: torch.Tensor, - batch_mlength_lst: List[int], + batch_mlen_stage_lst: List[int], output: torch.Tensor, ): # """Applies lora to each input. 
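# An eager-mode sketch of the shrink/expand pair that _apply_lora_triton
# drives through add_lora_triton (hypothetical sizes, single LoRA,
# reference math only):
import torch

tokens, hidden, rank, out_dim = 10, 256, 16, 256
x = torch.randn(tokens, hidden, dtype=torch.float16)
lora_a = torch.randn(rank, hidden, dtype=torch.float16)   # one A matrix
lora_b = torch.randn(out_dim, rank, dtype=torch.float16)  # one B matrix
output = torch.zeros(tokens, out_dim, dtype=torch.float16)

buffer = x.float() @ lora_a.float().t()                   # sgmv_shrink, scale=1.0
output += (buffer @ lora_b.float().t()).to(output.dtype)  # sgmv_expand, add_inputs=True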
@@ -124,13 +124,13 @@ def _apply_lora_triton( x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) - batch_size = batch_mlength_lst[0] - max_length = batch_mlength_lst[1] + batch_size = batch_mlen_stage_lst[0] + max_length = batch_mlen_stage_lst[1] add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, - b_seq_start_tensor, seq_length_tensor, - lora_index_tensor, batch_size, max_length, 0, 1.0) - return output.view_as(org_output) + b_seq_start_tensor, seq_length_tensor, lora_index_tensor, + batch_size, max_length, 0, 1.0) + return output.view_as(org_output) def _apply_lora_packed_nslice( @@ -181,7 +181,7 @@ def _apply_lora_triton_nslice( b_seq_start_tensor: torch.Tensor, seq_length_tensor: torch.Tensor, lora_index_tensor: torch.Tensor, - batch_mlength_lst: List[int], + batch_mlen_stage_lst: List[int], output: torch.Tensor, output_slices: Tuple[int, ...], ): @@ -204,8 +204,8 @@ def _apply_lora_triton_nslice( x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) - batch_size = batch_mlength_lst[0] - max_length = batch_mlength_lst[1] + batch_size = batch_mlen_stage_lst[0] + max_length = batch_mlen_stage_lst[1] offset_left = 0 #TODO fuse these kernel @@ -275,24 +275,14 @@ def set_lora( ... def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): + self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, + indices_len: List[int], seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): """Sets the mapping indices.""" ... - def set_kernel_mapping(self, seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - lora_index_tensor: torch.Tensor, - batch_mlength_lst: List[int]): - """Sets the kernel mapping""" - ... 
- @classmethod def can_replace_layer(cls, source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: List, @@ -372,8 +362,7 @@ def create_lora_weights( self.seq_length_tensor: torch.Tensor self.b_seq_start_tensor: torch.Tensor - self.lora_index_tensor: torch.Tensor - self.batch_mlength_list: List[int] + self.batch_mlen_stage_lst: List[int] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -409,29 +398,17 @@ def set_lora( self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings) def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): + self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, + indices_len: List[int], seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): self.indices = base_indices self.embeddings_indices = embeddings_indices self.indices_len = indices_len - - def set_kernel_mapping( - self, - seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - lora_index_tensor: torch.Tensor, - batch_mlength_lst: List[int], - ): self.seq_length_tensor = seq_length_tensor self.b_seq_start_tensor = b_seq_start_tensor - self.lora_index_tensor = lora_index_tensor - self.batch_mlength_list = batch_mlength_lst + self.batch_mlen_stage_lst = batch_mlen_stage_lst def forward(self, x: torch.Tensor) -> torch.Tensor: added_tokens_mask = x > self.base_layer.org_vocab_size - 1 @@ -453,17 +430,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings = full_lora_a_embeddings.view( full_lora_a_embeddings.shape[0] * full_lora_a_embeddings.shape[1], -1) - batchs, max_length = self.batch_mlength_list[ - 0], self.batch_mlength_list[1] + batch_size, max_length = self.batch_mlen_stage_lst[ + 0], self.batch_mlen_stage_lst[1] sgmv_expand( full_lora_a_embeddings, self.lora_b_stacked, full_output, - self.b_seq_start_tensor[:batchs], - self.seq_length_tensor[:batchs], - self.lora_index_tensor[:batchs], - batchs, + self.b_seq_start_tensor[:batch_size], + self.seq_length_tensor[:batch_size], + self.indices[:batch_size], + batch_size, max_length, True, ) @@ -524,8 +501,7 @@ def create_lora_weights( self.indices_len: List[int] self.seq_length_tensor: torch.Tensor self.b_seq_start_tensor: torch.Tensor - self.lora_index_tensor: torch.Tensor - self.batch_mlength_list: List[int] + self.batch_mlen_stage_lst: List[int] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -563,39 +539,27 @@ def set_lora( lora_b.T, non_blocking=True) def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): + self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, + indices_len: List[int], seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): self.indices = base_indices self.indices_len = indices_len - - def set_kernel_mapping( - self, - seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - lora_index_tensor: torch.Tensor, - batch_mlength_lst: List[int], - ): 
self.seq_length_tensor = seq_length_tensor self.b_seq_start_tensor = b_seq_start_tensor - self.lora_index_tensor = lora_index_tensor - self.batch_mlength_list = batch_mlength_lst + self.batch_mlen_stage_lst = batch_mlen_stage_lst def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - batch_size = self.batch_mlength_list[0] + batch_size = self.batch_mlen_stage_lst[0] # maybe we need not restrict range to [:batch_size] _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, self.b_seq_start_tensor[:batch_size], self.seq_length_tensor[:batch_size], - self.lora_index_tensor[:batch_size], - self.batch_mlength_list, output) + self.indices[:batch_size], self.batch_mlen_stage_lst, + output) return output def forward(self, input_): @@ -686,11 +650,12 @@ def create_lora_weights( self.output_dim = self.lora_b_stacked[0].shape[2] # Lazily initialized. self.indices: torch.Tensor + self.indices_len: torch.Tensor self.seq_length_tensor: torch.Tensor self.b_seq_start_tensor: torch.Tensor self.lora_index_tensor: torch.Tensor - self.batch_mlength_list: List[int] + self.batch_mlen_stage_lst: List[int] def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 @@ -756,15 +721,15 @@ def apply(self, x: torch.Tensor, # output, # (self.output_dim, self.output_dim), # ) - batchs = self.batch_mlength_list[0] + batch_size = self.batch_mlen_stage_lst[0] _apply_lora_triton_nslice( x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor[:batchs], - self.seq_length_tensor[:batchs], - self.lora_index_tensor[:batchs], - self.batch_mlength_list, + self.b_seq_start_tensor[:batch_size], + self.seq_length_tensor[:batch_size], + self.indices[:batch_size], + self.batch_mlen_stage_lst, output, (self.output_dim, self.output_dim), ) @@ -940,12 +905,12 @@ def create_lora_weights( self.packed_indices: Optional[torch.Tensor] = None self.standard_indices: Optional[torch.Tensor] = None # lazily initialized. 
+ self.indices: torch.Tensor self.indices_len: List[int] self.seq_length_tensor: torch.Tensor self.b_seq_start_tensor: torch.Tensor - self.lora_index_tensor: torch.Tensor - self.batch_mlength_list: List[int] + self.batch_mlen_stage_lst: List[int] def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 @@ -1032,15 +997,15 @@ def apply(self, x: torch.Tensor, # output, # self.output_slices, # ) - batchs = self.batch_mlength_list[0] + batch_size = self.batch_mlen_stage_lst[0] _apply_lora_triton_nslice( x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor[:batchs], - self.seq_length_tensor[:batchs], - self.lora_index_tensor[:batchs], - self.batch_mlength_list, + self.b_seq_start_tensor[:batch_size], + self.seq_length_tensor[:batch_size], + self.indices[:batch_size], + self.batch_mlen_stage_lst, output, self.output_slices, ) @@ -1104,7 +1069,7 @@ def create_lora_weights( self.seq_length_tensor: torch.Tensor self.b_seq_start_tensor: torch.Tensor self.lora_index_tensor: torch.Tensor - self.batch_mlength_list: List[int] + self.batch_mlen_stage_lst: List[int] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -1142,38 +1107,26 @@ def set_lora( lora_b.T, non_blocking=True) def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): + self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, + indices_len: List[int], seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): self.indices = base_indices self.indices_len = indices_len - - def set_kernel_mapping( - self, - seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - lora_index_tensor: torch.Tensor, - batch_mlength_lst: List[int], - ): self.seq_length_tensor = seq_length_tensor self.b_seq_start_tensor = b_seq_start_tensor - self.lora_index_tensor = lora_index_tensor - self.batch_mlength_list = batch_mlength_lst + self.batch_mlen_stage_lst = batch_mlen_stage_lst def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) - batch_size = self.batch_mlength_list[0] + batch_size = self.batch_mlen_stage_lst[0] # maybe we need not restrict range to [:batch_size] _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, self.b_seq_start_tensor[:batch_size], self.seq_length_tensor[:batch_size], - self.lora_index_tensor[:batch_size], - self.batch_mlength_list, output) + self.indices[:batch_size], self.batch_mlen_stage_lst, + output) return output # def apply(self, x: torch.Tensor) -> torch.Tensor: @@ -1187,13 +1140,13 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # self.indices[:self.indices_len[0]], # output, # ) - # batch_size = self.batch_mlength_list[0] - # mid2_buffer,_=_apply_lora_triton(x, self.lora_a_stacked, + # batch_size = self.batch_mlen_stage_lst[0] + # mid2_buffer,_=_apply_lora_triton(x, self.lora_a_stacked, # self.lora_b_stacked, # self.b_seq_start_tensor[:batch_size], # self.seq_length_tensor[:batch_size], - # self.lora_index_tensor[:batch_size], - # self.batch_mlength_list, output) + # self.indices[:batch_size], + # self.batch_mlen_stage_lst, output) # flag = torch.allclose(mid_buffer, mid2_buffer, 3e-2, 2e-2) # # if not flag: # # print("error") @@ -1334,7 +1287,7 @@ def 
create_lora_weights( self.seq_length_tensor: torch.Tensor self.b_seq_start_tensor: torch.Tensor self.lora_index_tensor: torch.Tensor - self.batch_mlength_list: List[int] + self.batch_mlen_stage_lst: List[int] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -1361,29 +1314,17 @@ def set_lora( shape[1], ] = embeddings_tensor def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): + self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, + indices_len: List[int], seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): self.indices = sampler_indices self.indices_padded = sampler_indices_padded self.indices_len = indices_len - - def set_kernel_mapping( - self, - seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - lora_index_tensor: torch.Tensor, - batch_mlength_lst: List[int], - ): self.seq_length_tensor = seq_length_tensor self.b_seq_start_tensor = b_seq_start_tensor - self.lora_index_tensor = lora_index_tensor - self.batch_mlength_list = batch_mlength_lst + self.batch_mlen_stage_lst = batch_mlen_stage_lst def _get_logits( self, @@ -1431,12 +1372,12 @@ def _get_logits( logits, ) - # batch_size=self.batch_mlength_list[0] + # batch_size=self.batch_mlen_stage_lst[0] # _apply_lora_triton(hidden_states, self.lora_a_stacked, self.lora_b_stacked, # self.b_seq_start_tensor[:batch_size], # self.seq_length_tensor[:batch_size], # self.indices[:self.indices_len[1]], - # self.batch_mlength_list, logits_temp) + # self.batch_mlen_stage_lst, logits_temp) # flag=torch.allclose(logits_temp,logits,rtol=1e-2,atol=1e-2) # if flag: # print("pass") @@ -1517,14 +1458,11 @@ def set_lora( ... def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): + self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, + indices_len: List[int], seq_length_tensor: torch.Tensor, + b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): self.long_lora_indices = long_lora_indices self.indices_len = indices_len diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 96e2e51bd93e..1cdc3a03b8bf 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -20,6 +20,12 @@ parse_fine_tuned_lora_name, replace_submodule) from vllm.utils import LRUCache, is_pin_memory_available +# NOTE: The number of _MAX_BATCHS derived from worker's model_runner. +# _BATCH_SIZES_TO_CAPTURE.It needs to be updated if _BATCH_SIZES_TO_CAPTURE +# is changed. 
+ +_MAX_BATCHS = 256+16 #max(_BATCH_SIZES_TO_CAPTURE)+16 + logger = init_logger(__name__) _GLOBAL_LORA_ID = 0 @@ -83,7 +89,7 @@ def convert_mapping( """ index_mapping_indices: List[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() - lora_indices = index_mapping_indices.copy() + lora_indices = mapping.batch_mapping.copy() long_lora_offsets: Optional[torch.Tensor] = None if long_lora_context: long_lora_offsets = torch.zeros(len(index_mapping_indices), @@ -93,22 +99,27 @@ def convert_mapping( lora_index_to_id.index(x) if x > 0 else -1 for x in mapping.prompt_mapping ] - lora_idx = None + token_lora_idx = None for i in range(len(index_mapping_indices)): # TODO index can be slow. optimize - lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) - if index_mapping_indices[i] > 0 else -1) - embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 - lora_indices[i] = lora_idx + token_lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) + if index_mapping_indices[i] > 0 else -1) + embedding_indices[ + i] = token_lora_idx if index_mapping_indices[i] > 0 else 0 if long_lora_context: assert long_lora_offsets is not None lora_offset: int = long_lora_context.offsets_by_lora_id.get( index_mapping_indices[i], 0) long_lora_offsets[i] = lora_offset + # every seq lora_id + for i in range(len(lora_indices)): + lora_indices[i] = (lora_index_to_id.index(lora_indices[i]) + if lora_indices[i] > 0 else -1) indices_list: List[Union[List[int], torch.Tensor]] = [ - index_mapping_indices, lora_indices, embedding_indices + index_mapping_indices, embedding_indices ] + base_indices = torch.tensor(lora_indices, dtype=torch.long, device="cuda") if long_lora_context: assert long_lora_offsets is not None indices_list.append(long_lora_offsets) @@ -117,11 +128,11 @@ def convert_mapping( device="cuda", dtype=torch.long) embeddings_indices = torch.stack([ - indices[2] * extra_vocab_size, - indices[2] * (vocab_size + extra_vocab_size) + indices[1] * extra_vocab_size, + indices[1] * (vocab_size + extra_vocab_size) ]) embeddings_indices[embeddings_indices == -1] = max_loras - 1 - base_indices = indices[1] + sampler_indices = prompt_mapping_tensor sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 @@ -132,7 +143,7 @@ def convert_mapping( long_lora_indices = None long_lora_indices_len: Optional[int] = None if long_lora_context: - long_lora_indices = indices[3] + long_lora_indices = indices[2] long_lora_indices_len = long_lora_indices.shape[-1] # Contain length of indices tensors. Used to index into each tensor. indices_len = [ @@ -400,6 +411,7 @@ def __init__( self.max_num_batched_tokens, dtype=torch.long, device="cuda") + self.long_lora_indices = torch.empty(self.max_num_batched_tokens, dtype=torch.long, device="cuda") @@ -429,17 +441,19 @@ def __init__( self._last_mapping: Optional[LoRAMapping] = None # triton kernel mapping - - self.batch_mlength_lst = [-1] * 2 - self.seq_length_tensor = torch.empty(self.max_num_batched_tokens, + self.seq_length_tensor = torch.empty(_MAX_BATCHS, dtype=torch.long, device="cuda") - self.b_seq_start_tensor = torch.zeros(self.max_num_batched_tokens, + self.b_seq_start_tensor = torch.zeros(_MAX_BATCHS, dtype=torch.long, device="cuda") - self.lora_index_tensor = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") + + # element contains batch_size, max_length, 0 or 1. 
Use 1 for the + # prefilling stage and 0 for the decoding stage.The reason for + # distinguishing between the prefilling and decoding stage is that + # if we have implemented bgmv, it can be utilized during the decoding + # stage. + self.batch_mlen_stage_lst = [-1] * 3 self._create_lora_modules() self.model.lora_manager = self @@ -561,35 +575,23 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: # Maintain the reference self.indices_len[:] = indices_len - if mapping.seq_lens: + # Mapping for sgmv kernel + if mapping.seq_lens and mapping.batch_mapping: batchs = len(mapping.seq_lens) seq_length_tensor = torch.tensor(mapping.seq_lens, dtype=torch.long, device="cuda") self.seq_length_tensor[:batchs].copy_(seq_length_tensor) - # b_seq_start_tensor = torch.zeros(seq_length_tensor.shape[0] + 1, - # dtype=torch.long, - # device="cuda") - # torch.cumsum(seq_length_tensor, - # dim=0, - # dtype=seq_length_tensor.dtype, - # out=b_seq_start_tensor[1:]) - torch.cumsum(seq_length_tensor, - dim=0, - dtype=seq_length_tensor.dtype, - out=self.b_seq_start_tensor[1:]) - # self.b_seq_start_tensor[:batchs].copy_(b_seq_start_tensor) - lora_id_lst = [] - for lora_index in mapping.batch_mapping: - lora_id_lst.append( - self.lora_index_to_id.index(lora_index - ) if lora_index > 0 else -1) - lora_id_tensor = torch.tensor(lora_id_lst, - dtype=torch.long, - device="cuda") - self.lora_index_tensor[:lora_id_tensor.size(0)].copy_( - lora_id_tensor) - self.batch_mlength_lst[:] = [batchs, max(mapping.seq_lens)] + temp_tensor=torch.cumsum( + seq_length_tensor, + dim=0, + dtype=seq_length_tensor.dtype) + self.b_seq_start_tensor[1:temp_tensor.size(0)+1].copy_(temp_tensor) + + self.batch_mlen_stage_lst[:] = [ + batchs, + max(mapping.seq_lens), 1 if mapping.is_prefilling else 0 + ] def set_lora_mapping(self, lora_mapping: LoRAMapping) -> None: if self._last_mapping != lora_mapping: @@ -642,11 +644,10 @@ def _create_lora_modules(self): new_module.set_mapping(self.base_indices, self.sampler_indices, self.sampler_indices_padded, self.embeddings_indices, - self.long_lora_indices, self.indices_len) - new_module.set_kernel_mapping(self.seq_length_tensor, - self.b_seq_start_tensor, - self.lora_index_tensor, - self.batch_mlength_lst) + self.long_lora_indices, self.indices_len, + self.seq_length_tensor, + self.b_seq_start_tensor, + self.batch_mlen_stage_lst) def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): assert isinstance(module, BaseLayerWithLoRA) diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index c68c551db89e..f2af7be4ad62 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -1,3 +1,10 @@ +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. +https://arxiv.org/abs/2310.18547 +""" + import torch import triton import triton.language as tl diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index b0bf8015431e..72ed81bcbbd3 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -1,3 +1,10 @@ +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. 
+https://arxiv.org/abs/2310.18547 +""" + import torch import triton import triton.language as tl diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index b8d0d8a23c8c..b5b0569b54d3 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -1,3 +1,10 @@ + +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. +https://arxiv.org/abs/2310.18547 +""" import torch import triton import triton.language as tl From 2dfeb97c9d1e15ba1f5b78187f82c71d2f2ecb63 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 29 May 2024 19:19:17 +0800 Subject: [PATCH 07/71] optimize code --- vllm/lora/ops/sgmv_shrink.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index b5b0569b54d3..2727efbd57b6 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -50,6 +50,8 @@ def _sgmv_shrink_kernel( if pid_m * BLOCK_M > M: return lora_index = tl.load(lora_indices + cur_batch) + if lora_index == -1: + return cur_seq_start = tl.load(b_seq_start_loc + cur_batch) offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N From 5e55ab8d69219e7d9567f171b3510871b522c95e Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 30 May 2024 07:42:04 +0800 Subject: [PATCH 08/71] add bgmv --- vllm/lora/ops/bgmv_expand.py | 156 ++++++++++++++++++++++++++ vllm/lora/ops/bgmv_expand_slice.py | 169 +++++++++++++++++++++++++++++ vllm/lora/ops/bgmv_shrink.py | 139 ++++++++++++++++++++++++ vllm/lora/ops/sgmv_expand_slice.py | 2 +- vllm/lora/ops/sgmv_shrink.py | 46 +++++--- 5 files changed, 496 insertions(+), 16 deletions(-) create mode 100644 vllm/lora/ops/bgmv_expand.py create mode 100644 vllm/lora/ops/bgmv_expand_slice.py create mode 100644 vllm/lora/ops/bgmv_shrink.py diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py new file mode 100644 index 000000000000..19c8d511ff9c --- /dev/null +++ b/vllm/lora/ops/bgmv_expand.py @@ -0,0 +1,156 @@ +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. 
+https://arxiv.org/abs/2310.18547 +""" + +import triton +import triton.language as tl +import torch + + +@triton.jit +def _bgmv_expand_kernel( + input_ptr, + lora_ptr, + out_ptr, + N, + K, + lora_indices, + xm_stride, + xk_stride, + l0_stride, + lora_k_stride, + lora_n_stride, + cm_stride, + cn_stride, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, + ADD_INPUTS: tl.constexpr, + CAST_TYPE: tl.constexpr, +): + pid_n = tl.program_id(axis=0) + cur_batch = tl.program_id(axis=1) + lora_index = tl.load(lora_indices + cur_batch) + if lora_index == -1: + return + offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + offset_k = tl.arange(0, BLOCK_K) + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + # a_ptr = input_ptr + cur_batch * xm_stride + offset_k[None, :] * xk_stride + a_ptr = input_ptr + cur_batch * xm_stride + offset_k[:,None] * xk_stride + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + rbn[None, :] * lora_k_stride + + offset_k[:, None] * lora_n_stride + ) + accumulator = tl.zeros((1, BLOCK_N), dtype=lora_ptr.dtype.element_ty) + for k in range(0, tl.cdiv(K, BLOCK_K)): + if EVEN_K: + tiled_a = tl.load(a_ptr) + tiled_b = tl.load(b_ptr) + else: + k_remaining = K - k * BLOCK_K + tiled_a = tl.load( + a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0 + ) + tiled_b = tl.load( + b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0 + ) + if CAST_TYPE: + tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) + accumulator += tl.sum(tiled_a[None, :] * tiled_b, 1) + a_ptr += BLOCK_K * xk_stride + b_ptr += BLOCK_K * lora_n_stride + + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + c_ptr = out_ptr + cur_batch * cm_stride + offset_cn[None, :] * cn_stride + c_mask = offset_cn[None, :] < N + if ADD_INPUTS: + tiled_out = tl.load(c_ptr, mask=c_mask) + accumulator += tiled_out + tl.store(c_ptr, accumulator, mask=c_mask) + + +@torch.inference_mode() +def bgmv_expand( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batchs: int, + add_inputs: bool = False, +): + """ + Args: + inputs (torch.Tensor): input tensor + lora_b_weights (torch.Tensor): lora'a weight + output_tensor (torch.Tensor): output tensor + lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batchs (int): batch size + add_inputs (bool, optional): _description_. Defaults to False. + cast_type (bool, optional): _description_. Defaults to False. 
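+
+        Unlike the sgmv kernels there is exactly one input row per batch
+        element, so no start/length tensors are needed. Reference
+        semantics (sketch, assuming 3-D lora_b_weights):
+
+            for i in range(batchs):
+                idx = lora_indices_tensor[i]
+                if idx == -1:
+                    continue
+                delta = inputs[i] @ lora_b_weights[idx].T
+                output_tensor[i] = (
+                    output_tensor[i] + delta if add_inputs else delta)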
+ """ + + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + torch.float32, + ] + assert inputs.size(1) == lora_b_weights.size(-1) + + assert lora_indices_tensor.size(0) == batchs + assert inputs.is_contiguous() + assert output_tensor.is_contiguous() + + if lora_b_weights.ndim == 4: # shape:(lora_num,1,size,rank) + assert lora_b_weights.size(1) == 1 + lora_b_weights = lora_b_weights.squeeze(dim=1) + else: + assert lora_b_weights.ndim == 3 # shape:(lora_num,size,rank) + + assert lora_b_weights.is_contiguous() + + # TODO tuning this config + + N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size + BLOCK_N = 32 + BLOCK_K = 16 + EVEN_K = K % BLOCK_K == 0 + ADD_INPUTS = add_inputs + CAST_TYPE = False + if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + ]: + CAST_TYPE = True + grid = [ + triton.cdiv(N, BLOCK_N), + batchs, + ] + _bgmv_expand_kernel[grid]( + inputs, + lora_b_weights, + output_tensor, + N, + K, + lora_indices_tensor, + inputs.stride(0), + inputs.stride(1), + lora_b_weights.stride(0), + lora_b_weights.stride(1), + lora_b_weights.stride(2), + output_tensor.stride(0), + output_tensor.stride(1), + BLOCK_N, + BLOCK_K, + EVEN_K, + ADD_INPUTS, + CAST_TYPE, + ) + return diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py new file mode 100644 index 000000000000..0404f2383d10 --- /dev/null +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -0,0 +1,169 @@ +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. +https://arxiv.org/abs/2310.18547 +""" + +import triton +import triton.language as tl +import torch + +@triton.jit +def _bgmv_expand_slice_kernel( + input_ptr, + lora_ptr, + out_ptr, + N, + K, + lora_indices, + xm_stride, + xk_stride, + l0_stride, + lora_k_stride, + lora_n_stride, + cm_stride, + cn_stride, + slice_offset, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, + ADD_INPUTS: tl.constexpr, + CAST_TYPE: tl.constexpr, +): + pid_n = tl.program_id(axis=0) + pid_sk = tl.program_id(axis=1) + cur_batch = tl.program_id(axis=2) + lora_index = tl.load(lora_indices + cur_batch) + if lora_index == -1: + return + offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + offset_k = tl.arange(0, BLOCK_K) + offset_k = pid_sk * BLOCK_K + tl.arange(0, BLOCK_K) + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + a_ptr = input_ptr + cur_batch * xm_stride + offset_k[None, :] * xk_stride + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + rbn[None, :] * lora_k_stride + + offset_k[:, None] * lora_n_stride + ) + accumulator = tl.zeros((1, BLOCK_N), dtype=lora_ptr.dtype.element_ty) + for k in range(0, tl.cdiv(K, BLOCK_K)): + if EVEN_K: + tiled_a = tl.load(a_ptr) + tiled_b = tl.load(b_ptr) + else: + k_remaining = K - k * BLOCK_K + tiled_a = tl.load(a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0) + tiled_b = tl.load(b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0) + if CAST_TYPE: + tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) + accumulator += tl.sum(tiled_a[None, :] * tiled_b, 1) + a_ptr += BLOCK_K * xk_stride + b_ptr += BLOCK_K * lora_n_stride + + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N+slice_offset + c_ptr = out_ptr + cur_batch * cm_stride + offset_cn[None, :] * cn_stride + c_mask = offset_cn[None, :] < (slice_offset+N) + if ADD_INPUTS: + tiled_out = tl.load(c_ptr, mask=c_mask) + accumulator 
+= tiled_out + tl.store(c_ptr, accumulator, mask=c_mask) + + +@torch.inference_mode() +def bgmv_expand_slice( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batchs: int, + max_seq_length: int, + slice_offset: int, + slice_size: int, + add_inputs: bool = False, +): + """_summary_ + + Args: + inputs (torch.Tensor): input tensor + lora_b_weights (torch.Tensor): lora'a weight + output_tensor (torch.Tensor): output tensor + b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative + sequence lengths of the sequences in the batch, used to index + into sequence. E.g.,if the sequence length is [4, 6], it is + [0, 4, 10]. + seq_len_tensor (torch.Tensor): (batch_size,). record the sequence + length of the sequences in the batch + lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batchs (int): batch size + max_seq_length (int): The max sequence lengths of the sequences + in the batch + slice_offst (int): output_tensor's offst + slice_size (int): current output_tensor's size + add_inputs (bool, optional): _description_. Defaults to False. + """ + + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + torch.float32, + ] + assert inputs.size(1) == lora_b_weights.size(-1) + assert lora_indices_tensor.size(0) == batchs + assert slice_size == lora_b_weights.size(-2) + assert inputs.is_contiguous() + assert output_tensor.is_contiguous() + + if lora_b_weights.ndim == 4: # shape:(lora_num,1,size,rank) + assert lora_b_weights.size(1) == 1 + lora_b_weights = lora_b_weights.squeeze(dim=1) + else: + assert lora_b_weights.ndim == 3 # shape:(lora_num,size,rank) + + assert lora_b_weights.is_contiguous() + + # TODO tuning this config + N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size + + BLOCK_M = 32 + BLOCK_N = 32 + BLOCK_K = 16 + EVEN_K = K % BLOCK_K == 0 + ADD_INPUTS = add_inputs + CAST_TYPE = False + if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ + torch.float16, + torch.bfloat16, + ]: + CAST_TYPE = True + grid = [ + triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), + batchs, + ] + _bgmv_expand_slice_kernel[grid]( + inputs, + lora_b_weights, + output_tensor, + N, + K, + lora_indices_tensor, + inputs.stride(0), + inputs.stride(1), + lora_b_weights.stride(0), + lora_b_weights.stride(1), + lora_b_weights.stride(2), + output_tensor.stride(0), + output_tensor.stride(1), + slice_offset, + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + ADD_INPUTS, + CAST_TYPE, + ) + return diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py new file mode 100644 index 000000000000..eeeff502eb5b --- /dev/null +++ b/vllm/lora/ops/bgmv_shrink.py @@ -0,0 +1,139 @@ +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. 
+https://arxiv.org/abs/2310.18547 +""" + +import triton +import triton.language as tl +import torch + +@triton.jit +def _bgmv_shrink_kernel( + input_ptr, + lora_ptr, + out_ptr, + N, + K, + lora_indices, + scaling, + xm_stride, + xk_stride, + l0_stride, + lora_k_stride, + lora_n_stride, + cm_stride, + cn_stride, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, + SPLIT_K: tl.constexpr, +): + pid_n = tl.program_id(axis=0) + pid_sk = tl.program_id(axis=1) + cur_batch = tl.program_id(axis=2) + lora_index = tl.load(lora_indices + cur_batch) + if lora_index == -1: + return + offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + offset_k = pid_sk * BLOCK_K + tl.arange(0, BLOCK_K) + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + a_ptr = input_ptr + cur_batch * xm_stride + offset_k[:,None] * xk_stride + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + rbn[None, :] * lora_k_stride + + offset_k[:, None] * lora_n_stride + ) + accumulator = tl.zeros((1,BLOCK_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): + if EVEN_K: + tiled_a = tl.load(a_ptr) + tiled_b = tl.load(b_ptr) + else: + k_remaining = K - k * (BLOCK_K * SPLIT_K) + tiled_a = tl.load( + a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0 + ) + tiled_b = tl.load( + b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0 + ) + accumulator += tl.sum(tiled_a[None,:] * tiled_b, 1) + a_ptr += BLOCK_K * SPLIT_K * xk_stride + b_ptr += BLOCK_K * SPLIT_K * lora_n_stride + accumulator *= scaling + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + c_ptr = out_ptr + cur_batch * cm_stride + offset_cn[None, :] * cn_stride + c_mask = offset_cn[None, :] < N + if SPLIT_K: + tl.store(c_ptr, accumulator, mask=c_mask) + else: + tl.atomic_add(c_ptr, accumulator, mask=c_mask) + + +@torch.inference_mode() +def bgmv_shrink( + inputs: torch.Tensor, + lora_a_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batchs: int, + scaling: float, +): + """ + + Args: + inputs (torch.Tensor): input tensor + lora_a_weights (torch.Tensor): lora'a weight + output_tensor (torch.Tensor): output tensor + lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batchs (int): batch size + scaling (float): Scaling factor. 
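+
+        As a rough sketch of the intended math (an illustrative note, not
+        part of the kernel contract), for each token (batch row) i:
+        output_tensor[i] = scaling * (
+            inputs[i] @ lora_a_weights[lora_indices_tensor[i]].T)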
+ """ + assert inputs.dtype == lora_a_weights.dtype + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert inputs.size(1) == lora_a_weights.size(-1) + assert lora_indices_tensor.size(0) == batchs + assert inputs.is_contiguous() + + if lora_a_weights.ndim == 4: # shape:(lora_num,1,rank, size) + assert lora_a_weights.size(1) == 1 + lora_a_weights = lora_a_weights.squeeze(dim=1) + else: + assert lora_a_weights.ndim == 3 # shape:(lora_num,rank, size) + assert lora_a_weights.is_contiguous() + assert output_tensor.is_contiguous() + # TODO tuning this config + N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank + BLOCK_N = 16 + BLOCK_K = 32 + SPLIT_K = 1 + EVEN_K = K % (BLOCK_K * SPLIT_K) == 0 + grid = [ + triton.cdiv(N, BLOCK_N), + SPLIT_K, + batchs, + ] + _bgmv_shrink_kernel[grid]( + inputs, + lora_a_weights, + output_tensor, + N, + K, + lora_indices_tensor, + scaling, + inputs.stride(0), + inputs.stride(1), + lora_a_weights.stride(0), + lora_a_weights.stride(1), + lora_a_weights.stride(2), + output_tensor.stride(0), + output_tensor.stride(1), + BLOCK_N, + BLOCK_K, + EVEN_K, + SPLIT_K, + ) + return diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 72ed81bcbbd3..41e65d2a15d4 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -60,7 +60,7 @@ def _sgmv_expand_slice_kernel( offset_k = tl.arange(0, BLOCK_K) ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + offset_k[None, :] * xk_stride, ) b_ptr = (lora_ptr + l0_stride * lora_index + diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 2727efbd57b6..6a94aedde9d5 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -1,10 +1,10 @@ - """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). Punica: Multi-Tenant LoRA Serving. 
https://arxiv.org/abs/2310.18547 """ + import torch import triton import triton.language as tl @@ -60,31 +60,47 @@ def _sgmv_shrink_kernel( ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + - offset_k[None, :] * xk_stride) - b_ptr = (lora_ptr + l0_stride * lora_index + rbn[None, :] * lora_k_stride + - offset_k[:, None] * lora_n_stride) + a_ptr = ( + input_ptr + + cur_seq_start * xm_stride + + ram[:, None] * xm_stride + + offset_k[None, :] * xk_stride + ) + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + rbn[None, :] * lora_k_stride + + offset_k[:, None] * lora_n_stride + ) accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): if EVEN_K: - a = tl.load(a_ptr) - b = tl.load(b_ptr) + tiled_a = tl.load(a_ptr) + tiled_b = tl.load(b_ptr) else: k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0) - b = tl.load(b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0) - accumulator += tl.dot(a, b) + tiled_a = tl.load( + a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0 + ) + tiled_b = tl.load( + b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0 + ) + accumulator += tl.dot(tiled_a, tiled_b) a_ptr += BLOCK_K * SPLIT_K * xk_stride b_ptr += BLOCK_K * SPLIT_K * lora_n_stride offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N - c_ptr = (out_ptr + offset_cm[:, None] * cm_stride + - offset_cn[None, :] * cn_stride) - c_mask = (offset_cm[:, None] < - (cur_seq_start + M)) & (offset_cn[None, :] < N) + c_ptr = ( + out_ptr + + offset_cm[:, None] * cm_stride + + offset_cn[None, :] * cn_stride + ) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & ( + offset_cn[None, :] < N + ) accumulator *= scaling # handles write-back with reduction-splitting if SPLIT_K == 1: @@ -159,7 +175,7 @@ def sgmv_shrink( b_seq_start_loc, seq_len_tensor, lora_indices_tensor, - scaling, + scaling, inputs.stride(0), inputs.stride(1), lora_a_weights.stride(0), From 79c07ab225deb441d3dd45aee10eaa5d42977470 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 31 May 2024 19:29:53 +0800 Subject: [PATCH 09/71] modify bgmv --- tests/lora/test_triton_punica.py | 492 +++++++++++++++++++++++++++++ vllm/lora/ops/bgmv_expand.py | 84 +++-- vllm/lora/ops/bgmv_expand_slice.py | 108 +++---- vllm/lora/ops/bgmv_shrink.py | 84 +++-- vllm/lora/ops/sgmv_shrink.py | 45 ++- 5 files changed, 655 insertions(+), 158 deletions(-) create mode 100644 tests/lora/test_triton_punica.py diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py new file mode 100644 index 000000000000..74bab70f1aad --- /dev/null +++ b/tests/lora/test_triton_punica.py @@ -0,0 +1,492 @@ +import random + +import pytest +import torch + +import vllm.lora.punica as punica +from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.lora.ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.bgmv_shrink import bgmv_shrink +from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice + +# The current punica kernel supports dimension and adds a dimension of 3424. 
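+# (3424 is deliberately a size the existing punica CUDA kernel cannot handle;
+# the comparison tests below skip hidden_size == 3424 so that punica.bgmv does
+# not raise its "No suitable kernel" error.)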
+HIDDEN_SIZES = [ + 128, + 256, + 512, + 1024, + 1152, + 1280, + 1536, + 2048, + 2304, + 2560, + 2752, + 3072, + 3424, + 3456, + 3584, + 4096, + 4608, + 5120, + 5504, + 5632, + 6144, + 6848, + 6912, + 7168, + 8192, + 9216, + 10240, + 11008, + 13824, + 14336, + 15360, + 22016, + 24576, + 27392, + 27648, + 32000, + 32256, + 32512, + 32768, + 33024, + 36864, + 43264, + 49152, + 64000, + 64256, + 102400, + 102656, + 128000, + 128256, +] +BATCHS = [i for i in range(0, 64, 8)] +NUM_LORA = [1, 4, 8, 16, 32, 64, 128, 256] +DTYPES = [torch.half, torch.bfloat16, torch.float32] +MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] +SCALES = [0.5] +OP_TYPES = ["shrink", "expand"] +SEED = [0] +CUDA_DEVICES = [f"cuda:{0}"] + + +def assert_close(a, b): + rtol, atol = { + torch.float16: (1e-2, 1e-2), + torch.bfloat16: (12e-2, 1e-2), + torch.float32: (1e-2, 1e-2), + }[a.dtype] + torch.testing.assert_close(a, b, rtol=rtol, atol=atol) + + +@torch.inference_mode() +def _punica_bgmv(out_tensor, inputs, lora_weights, indices, scaling): + layer_idx = 0 + punica.bgmv(out_tensor, inputs, lora_weights, indices, layer_idx, scaling) + return + + +def _torch_groupgemm( + out_tensor, + inputs, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batchs, + scaling, + op_type, +) -> torch.Tensor: + out_list = [] + current_offset = 0 + for lora_index, b_length in zip(range(batchs), seq_len_tensor): + input_weight = inputs[current_offset : b_length + current_offset, :] + current_offset += b_length + lora_weight = lora_weights[lora_indices_tensor[lora_index]] + result = torch.nn.functional.linear(input_weight, lora_weight) + result *= scaling + out_list.append(result) + cat_result = torch.cat(out_list, dim=0) + if op_type == "expand": + out_tensor += cat_result + else: + out_tensor.copy_(cat_result) + return + + +def _generate_data( + batchs, hidden_size, lora_nums, max_rank, max_length, dtype, op_type, device +): + if max_length == 1: + max_length += 1 + seq_len_tensor = torch.randint(1, max_length, (batchs,)).to(device) + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0, + ).to(device) + total_tokens = seq_len_tensor.sum() + if op_type == "shrink": + inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to( + device + ) + lora_weights = torch.rand( + (lora_nums, max_rank, hidden_size), # col-major + dtype=dtype, + ).to(device) + # shrink op need atomic_add, so output is initinized by 0 + ref_out_tensor = torch.zeros( + (total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device + ) + # NOTE shrink kernel using torch.float32 as output type + our_out_tensor = torch.zeros( + (total_tokens, max_rank), + dtype=torch.float32, + device=inputs_tensor.device, + ) + else: + inputs_tensor = torch.rand( + (total_tokens, max_rank), + dtype=dtype, + ).to(device) + lora_weights = torch.rand( + (lora_nums, hidden_size, max_rank), # col-major + dtype=dtype, + ).to(device) + # expand op needs to complete y+=a@lora_b, so output is + # initinized randomly + ref_out_tensor = torch.rand( + (total_tokens, hidden_size), + dtype=dtype, + device=inputs_tensor.device, + ) + # Ensure the same input. 
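+        # Cloning keeps our_out_tensor identical to ref_out_tensor before the
+        # expand op runs, since expand accumulates into its output tensor.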
+ our_out_tensor = ref_out_tensor.clone() + + lora_indices_tensor = torch.randint( + 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs,) + ).to(device) + indices = torch.zeros((total_tokens), dtype=torch.long).to(device) + current_offset = 0 + for b_id in range(batchs): + lora_index = lora_indices_tensor[b_id] + indices[ + current_offset : current_offset + seq_len_tensor[b_id] + ] = lora_index.item() + current_offset += seq_len_tensor[b_id].item() + return ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) + + +@pytest.mark.skip("work in progress") +@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sgmv_torch( + batchs: int, + num_loras: int, + rank: int, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + torch.manual_seed(seed) + if batchs == 0: + batchs += 1 + hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) + hidden_size = HIDDEN_SIZES[hidden_size_index] + if hidden_size > 100000: + hidden_size = hidden_size // 4 # avoid OOM + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data( + batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, device + ) # The sequence length is restricted to the range [1, 1024]. + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + if op_type == "shrink": + sgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + scaling, + ) + else: + sgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + add_inputs=True, + ) + _torch_groupgemm( + ref_out_tensor, + inputs_tensor, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batchs, + scaling if op_type == "shrink" else 1.0, + op_type, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) + + +@pytest.mark.skip("work in progress") +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_triton_sgmv_punica_bgmv( + hidden_size, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error + if dtype == torch.float32 or hidden_size == 3424: + return + torch.manual_seed(seed) + batchs = 4 # Arbitrary values for testing + rank = 16 + seq_len = 333 # Arbitrary values for testing + num_loras = 8 # Arbitrary values for testing + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data( + batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device + ) + + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + if op_type == "shrink": + sgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + scaling, + ) + else: + sgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + add_inputs=True, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + _punica_bgmv( + ref_out_tensor, + inputs_tensor, + lora_weights_4d, + indices, + scaling if op_type == "shrink" else 1.0, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) + + +@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_triton_bgmv_punica_bgmv( + batchs: int, + hidden_size: int, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + # avoid `No suitable kernel. h_in=xx h_out=xxxx ` error + if dtype == torch.float32 or hidden_size == 3424: + return + torch.manual_seed(seed) + if batchs == 0: + batchs += 1 + rank = 16 + seq_len = 1 # + num_loras = 8 # Arbitrary values for testing + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data( + batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device + ) + + if op_type == "shrink": + bgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + lora_indices_tensor, + batchs, + scaling, + ) + else: + bgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + lora_indices_tensor, + batchs, + add_inputs=True, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + _punica_bgmv( + ref_out_tensor, + inputs_tensor, + lora_weights_4d, + indices, + scaling if op_type == "shrink" else 1.0, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) + + +@pytest.mark.skip("work in progress") +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sgmv_expand_nslice( + hidden_size, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error + if dtype == torch.float32 or hidden_size == 3424: + return + torch.manual_seed(seed) + batchs = 4 # Arbitrary values for testing + rank = 16 + seq_len = 333 # Arbitrary values for testing + num_loras = 8 # Arbitrary values for testing + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data( + batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device + ) + + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + + sgmv_expand_slice( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + 1024, + add_inputs=True, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + _punica_bgmv( + ref_out_tensor, + inputs_tensor, + lora_weights_4d, + indices, + scaling if op_type == "shrink" else 1.0, + ) + + assert_close(our_out_tensor, ref_out_tensor) + + +if __name__ == "__main__": + test_triton_bgmv_punica_bgmv( + batchs=1, + hidden_size=128, + scaling=0.5, + dtype=torch.float16, + op_type="expand", + seed=0, + device="cuda:0", + ) diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 19c8d511ff9c..7762276b65ce 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -27,52 +27,51 @@ def _bgmv_expand_kernel( cn_stride, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - EVEN_K: tl.constexpr, ADD_INPUTS: tl.constexpr, CAST_TYPE: tl.constexpr, ): - pid_n = tl.program_id(axis=0) - cur_batch = tl.program_id(axis=1) + """ + C=A@B, and B is col-major matrix + """ + cur_batch = tl.program_id(axis=0) lora_index = tl.load(lora_indices + cur_batch) if lora_index == -1: return - offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N offset_k = tl.arange(0, BLOCK_K) - rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - # a_ptr = input_ptr + cur_batch * xm_stride + offset_k[None, :] * xk_stride - a_ptr = input_ptr + cur_batch * xm_stride + offset_k[:,None] * xk_stride - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + rbn[None, :] * lora_k_stride - + offset_k[:, None] * lora_n_stride - ) - accumulator = tl.zeros((1, BLOCK_N), dtype=lora_ptr.dtype.element_ty) - for k in range(0, tl.cdiv(K, BLOCK_K)): - if EVEN_K: - tiled_a = tl.load(a_ptr) - tiled_b = tl.load(b_ptr) - else: - k_remaining = K - k * BLOCK_K - tiled_a = tl.load( - a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0 - ) - tiled_b = tl.load( - b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0 - ) - if CAST_TYPE: - tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) - accumulator += tl.sum(tiled_a[None, :] * tiled_b, 1) - a_ptr += BLOCK_K * xk_stride - b_ptr += BLOCK_K * lora_n_stride + offset_n = tl.arange(0, BLOCK_N) + # tl.max_contiguous(offset_k, BLOCK_K) + tiled_a = tl.load( + input_ptr + cur_batch * xm_stride + offset_k * xk_stride, + mask=offset_k < K, + other=0, + ) # [BLOCK_K] + b_ptr = lora_ptr + l0_stride * lora_index + if CAST_TYPE: + tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) + # sliding to next row-block + + for n in range(0, N, BLOCK_N): + current_n = n + offset_n + # vector load + current_n_c = tl.max_contiguous(current_n, BLOCK_N) + b_ptr_mask = (current_n[:, None] < N) & (offset_k[None, :] < K) + + tiled_b = tl.load( + b_ptr + + current_n_c[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, + 
mask=b_ptr_mask, + other=0.0, + ) # [BLOCK_N,BLOCK_K] + + accumulator = tl.sum(tiled_a * tiled_b, 1) - offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N - c_ptr = out_ptr + cur_batch * cm_stride + offset_cn[None, :] * cn_stride - c_mask = offset_cn[None, :] < N - if ADD_INPUTS: - tiled_out = tl.load(c_ptr, mask=c_mask) - accumulator += tiled_out - tl.store(c_ptr, accumulator, mask=c_mask) + c_ptr = out_ptr + cur_batch * cm_stride + current_n * cn_stride + c_mask = current_n < N + if ADD_INPUTS: + tiled_out = tl.load(c_ptr, mask=c_mask) + accumulator += tiled_out + tl.store(c_ptr, accumulator, mask=c_mask) @torch.inference_mode() @@ -119,9 +118,8 @@ def bgmv_expand( # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - BLOCK_N = 32 - BLOCK_K = 16 - EVEN_K = K % BLOCK_K == 0 + BLOCK_N = 512 + BLOCK_K = triton.next_power_of_2(K) ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ @@ -130,9 +128,9 @@ def bgmv_expand( ]: CAST_TYPE = True grid = [ - triton.cdiv(N, BLOCK_N), batchs, ] + config = {"num_stages": 4, "num_warps": 8} _bgmv_expand_kernel[grid]( inputs, lora_b_weights, @@ -149,8 +147,8 @@ def bgmv_expand( output_tensor.stride(1), BLOCK_N, BLOCK_K, - EVEN_K, ADD_INPUTS, CAST_TYPE, + **config, ) return diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 0404f2383d10..a197f5eddb8b 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -9,17 +9,18 @@ import triton.language as tl import torch + @triton.jit def _bgmv_expand_slice_kernel( - input_ptr, - lora_ptr, - out_ptr, + input_ptr, + lora_ptr, + out_ptr, N, K, - lora_indices, - xm_stride, - xk_stride, - l0_stride, + lora_indices, + xm_stride, + xk_stride, + l0_stride, lora_k_stride, lora_n_stride, cm_stride, @@ -27,49 +28,56 @@ def _bgmv_expand_slice_kernel( slice_offset, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - EVEN_K: tl.constexpr, ADD_INPUTS: tl.constexpr, CAST_TYPE: tl.constexpr, ): - pid_n = tl.program_id(axis=0) - pid_sk = tl.program_id(axis=1) - cur_batch = tl.program_id(axis=2) + """ + C=A@B, and B is col-major matrix + """ + cur_batch = tl.program_id(axis=0) lora_index = tl.load(lora_indices + cur_batch) if lora_index == -1: return - offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N offset_k = tl.arange(0, BLOCK_K) - offset_k = pid_sk * BLOCK_K + tl.arange(0, BLOCK_K) - rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - a_ptr = input_ptr + cur_batch * xm_stride + offset_k[None, :] * xk_stride - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + rbn[None, :] * lora_k_stride - + offset_k[:, None] * lora_n_stride - ) - accumulator = tl.zeros((1, BLOCK_N), dtype=lora_ptr.dtype.element_ty) - for k in range(0, tl.cdiv(K, BLOCK_K)): - if EVEN_K: - tiled_a = tl.load(a_ptr) - tiled_b = tl.load(b_ptr) - else: - k_remaining = K - k * BLOCK_K - tiled_a = tl.load(a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0) - tiled_b = tl.load(b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0) - if CAST_TYPE: - tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) - accumulator += tl.sum(tiled_a[None, :] * tiled_b, 1) - a_ptr += BLOCK_K * xk_stride - b_ptr += BLOCK_K * lora_n_stride + offset_n = tl.arange(0, BLOCK_N) + # tl.max_contiguous(offset_k, BLOCK_K) + tiled_a = tl.load( + input_ptr + cur_batch * xm_stride + offset_k * xk_stride, + mask=offset_k < K, + other=0, + ) # [BLOCK_K] + b_ptr = lora_ptr + l0_stride * lora_index + if CAST_TYPE: + tiled_a = 
tiled_a.to(lora_ptr.dtype.element_ty) + # sliding to next row-block + + for n in range(0, N, BLOCK_N): + current_n = n + offset_n + # vector load + current_n_c = tl.max_contiguous(current_n, BLOCK_N) + b_ptr_mask = (current_n[:, None] < N) & (offset_k[None, :] < K) + + tiled_b = tl.load( + b_ptr + + current_n_c[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, + mask=b_ptr_mask, + other=0.0, + ) # [BLOCK_N,BLOCK_K] - offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N+slice_offset - c_ptr = out_ptr + cur_batch * cm_stride + offset_cn[None, :] * cn_stride - c_mask = offset_cn[None, :] < (slice_offset+N) - if ADD_INPUTS: - tiled_out = tl.load(c_ptr, mask=c_mask) - accumulator += tiled_out - tl.store(c_ptr, accumulator, mask=c_mask) + accumulator = tl.sum(tiled_a * tiled_b, 1) + + c_ptr = ( + out_ptr + + cur_batch * cm_stride + + slice_offset # slice size + + current_n * cn_stride + ) + c_mask = current_n < N + if ADD_INPUTS: + tiled_out = tl.load(c_ptr, mask=c_mask) + accumulator += tiled_out + tl.store(c_ptr, accumulator, mask=c_mask) @torch.inference_mode() @@ -126,22 +134,18 @@ def bgmv_expand_slice( assert lora_b_weights.is_contiguous() - # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - - BLOCK_M = 32 - BLOCK_N = 32 - BLOCK_K = 16 - EVEN_K = K % BLOCK_K == 0 + # TODO tuning this config + BLOCK_N = 512 + BLOCK_K = triton.next_power_of_2(K) ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ - triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), batchs, ] _bgmv_expand_slice_kernel[grid]( @@ -159,10 +163,8 @@ def bgmv_expand_slice( output_tensor.stride(0), output_tensor.stride(1), slice_offset, - BLOCK_M, BLOCK_N, BLOCK_K, - EVEN_K, ADD_INPUTS, CAST_TYPE, ) diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index eeeff502eb5b..ac61c9d50bda 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -9,6 +9,7 @@ import triton.language as tl import torch + @triton.jit def _bgmv_shrink_kernel( input_ptr, @@ -27,49 +28,44 @@ def _bgmv_shrink_kernel( cn_stride, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - EVEN_K: tl.constexpr, - SPLIT_K: tl.constexpr, ): - pid_n = tl.program_id(axis=0) - pid_sk = tl.program_id(axis=1) - cur_batch = tl.program_id(axis=2) + cur_batch = tl.program_id(axis=0) lora_index = tl.load(lora_indices + cur_batch) if lora_index == -1: return - offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N - offset_k = pid_sk * BLOCK_K + tl.arange(0, BLOCK_K) - rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - a_ptr = input_ptr + cur_batch * xm_stride + offset_k[:,None] * xk_stride - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + rbn[None, :] * lora_k_stride - + offset_k[:, None] * lora_n_stride - ) - accumulator = tl.zeros((1,BLOCK_N), dtype=tl.float32) - for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - if EVEN_K: - tiled_a = tl.load(a_ptr) - tiled_b = tl.load(b_ptr) - else: - k_remaining = K - k * (BLOCK_K * SPLIT_K) - tiled_a = tl.load( - a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0 - ) - tiled_b = tl.load( - b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0 - ) - accumulator += tl.sum(tiled_a[None,:] * tiled_b, 1) - a_ptr += BLOCK_K * SPLIT_K * xk_stride - b_ptr += BLOCK_K * SPLIT_K * lora_n_stride + + offset_n = tl.arange(0, BLOCK_N) + offset_k = tl.arange(0, BLOCK_K) + a_ptr = 
input_ptr + cur_batch * xm_stride + b_ptr = lora_ptr + l0_stride * lora_index + rank_mask = offset_n[:, None] < N + accumulator = tl.zeros((BLOCK_N,), dtype=tl.float32) + for k in range(0, K, BLOCK_K): + current_k = k + offset_k + # vector load + current_k_c = tl.max_contiguous(current_k, BLOCK_K) + tiled_a = tl.load( + a_ptr + current_k_c * xk_stride, + mask=current_k < K, + other=0.0, + ) # [BLOCK_K] + b_ptr_mask = (rank_mask < N) & (current_k[None, :] < K) + + tiled_b = tl.load( + b_ptr + + offset_n[:, None] * lora_k_stride + + current_k[None, :] * lora_n_stride, + mask=b_ptr_mask, + other=0.0, + ) # [BLOCK_N,BLOCK_K] + + accumulator += tl.sum(tiled_a * tiled_b, 1) accumulator *= scaling - offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N - c_ptr = out_ptr + cur_batch * cm_stride + offset_cn[None, :] * cn_stride - c_mask = offset_cn[None, :] < N - if SPLIT_K: - tl.store(c_ptr, accumulator, mask=c_mask) - else: - tl.atomic_add(c_ptr, accumulator, mask=c_mask) + offset_cn = tl.arange(0, BLOCK_N) + c_ptr = out_ptr + cur_batch * cm_stride + offset_cn * cn_stride + c_mask = offset_cn < N + + tl.store(c_ptr, accumulator, mask=c_mask) @torch.inference_mode() @@ -107,15 +103,12 @@ def bgmv_shrink( assert output_tensor.is_contiguous() # TODO tuning this config N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank - BLOCK_N = 16 - BLOCK_K = 32 - SPLIT_K = 1 - EVEN_K = K % (BLOCK_K * SPLIT_K) == 0 + BLOCK_K = 512 + BLOCK_N = triton.next_power_of_2(output_tensor.size(1)) grid = [ - triton.cdiv(N, BLOCK_N), - SPLIT_K, batchs, ] + config = {"num_stages": 4, "num_warps": 8} _bgmv_shrink_kernel[grid]( inputs, lora_a_weights, @@ -133,7 +126,6 @@ def bgmv_shrink( output_tensor.stride(1), BLOCK_N, BLOCK_K, - EVEN_K, - SPLIT_K, + **config, ) return diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 6a94aedde9d5..65bf1a6a5d47 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -120,6 +120,7 @@ def sgmv_shrink( batchs: int, max_seq_length: int, scaling: float, + config: dict, ): """ @@ -156,16 +157,26 @@ def sgmv_shrink( assert output_tensor.is_contiguous() # TODO tuning this config N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank - BLOCK_M = 32 - BLOCK_N = 16 - BLOCK_K = 32 - SPLIT_K = 16 - EVEN_K = False - grid = [ - triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), - SPLIT_K, - batchs, - ] + # BLOCK_M = config.get("BLOCK_M", 32) + # BLOCK_N = config.get("BLOCK_N", 32) + # BLOCK_K = config.get("BLOCK_K", 32) + # SPLIT_K = config.get("SPLIT_K", 16) + # num_warps = config.get("num_warps", 4) + # num_stages = config.get("num_stages", 3) + # BLOCK_M = 32 + # BLOCK_N = 16 + # BLOCK_K = 32 + # SPLIT_K = 16 + EVEN_K = K % config.get("BLOCK_K", 32) == 0 + # grid = [ + # triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), + # SPLIT_K, + # batchs, + # ] + + grid = lambda META: (triton.cdiv(max_seq_length, META[ + 'BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),META[ + 'SPLIT_K'],batchs) _sgmv_shrink_kernel[grid]( inputs, lora_a_weights, @@ -175,7 +186,7 @@ def sgmv_shrink( b_seq_start_loc, seq_len_tensor, lora_indices_tensor, - scaling, + scaling, inputs.stride(0), inputs.stride(1), lora_a_weights.stride(0), @@ -183,10 +194,12 @@ def sgmv_shrink( lora_a_weights.stride(2), output_tensor.stride(0), output_tensor.stride(1), - BLOCK_M, - BLOCK_N, - BLOCK_K, - EVEN_K, - SPLIT_K, + EVEN_K=EVEN_K, + **config + # BLOCK_M, + # BLOCK_N, + # BLOCK_K, + # EVEN_K, + # SPLIT_K, ) return From e2f56d5774e23227ab2f8578190665a62c23cfe1 Mon Sep 
17 00:00:00 2001 From: jeejeeli Date: Fri, 31 May 2024 22:44:46 +0800 Subject: [PATCH 10/71] resolve conflict --- tests/lora/test_triton_sgmv.py | 396 --------------------------------- vllm/worker/model_runner.py | 2 +- 2 files changed, 1 insertion(+), 397 deletions(-) delete mode 100644 tests/lora/test_triton_sgmv.py diff --git a/tests/lora/test_triton_sgmv.py b/tests/lora/test_triton_sgmv.py deleted file mode 100644 index db3739f35d24..000000000000 --- a/tests/lora/test_triton_sgmv.py +++ /dev/null @@ -1,396 +0,0 @@ -import random - -import pytest -import torch - -import vllm.lora.punica as punica -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_shrink import sgmv_shrink -from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice - -# The current punica kernel supports dimension and adds a dimension of 3424. -HIDDEN_SIZES = [ - 128, - 256, - 512, - 1024, - 1152, - 1280, - 1536, - 2048, - 2304, - 2560, - 2752, - 3072, - 3424, - 3456, - 3584, - 4096, - 4608, - 5120, - 5504, - 5632, - 6144, - 6848, - 6912, - 7168, - 8192, - 9216, - 10240, - 11008, - 13824, - 14336, - 15360, - 22016, - 24576, - 27392, - 27648, - 32000, - 32256, - 32512, - 32768, - 33024, - 36864, - 43264, - 49152, - 64000, - 64256, - 102400, - 102656, - 128000, - 128256, -] -BATCHS = [i for i in range(0, 64, 8)] -NUM_LORA = [1, 4, 8, 16, 32, 64, 128, 256] -DTYPES = [torch.half, torch.bfloat16, torch.float32] -MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] -SCALES = [0.5] -OP_TYPES = ["shrink", "expand"] -SEED = [0] -CUDA_DEVICES = [f"cuda:{0}"] - - -def assert_close(a, b): - rtol, atol = { - torch.float16: (1e-2, 1e-2), - torch.bfloat16: (12e-2, 1e-2), - torch.float32: (1e-2, 1e-2), - }[a.dtype] - torch.testing.assert_close(a, b, rtol=rtol, atol=atol) - - -@torch.inference_mode() -def _punica_bgmv(out_tensor, inputs, lora_weights, indices, scaling): - layer_idx = 0 - punica.bgmv(out_tensor, inputs, lora_weights, indices, layer_idx, scaling) - return - - -def _torch_groupgemm( - out_tensor, - inputs, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batchs, - scaling, - op_type, -) -> torch.Tensor: - out_list = [] - current_offset = 0 - for lora_index, b_length in zip(range(batchs), seq_len_tensor): - input_weight = inputs[current_offset:b_length + current_offset, :] - current_offset += b_length - lora_weight = lora_weights[lora_indices_tensor[lora_index]] - result = torch.nn.functional.linear(input_weight, lora_weight) - result *= scaling - out_list.append(result) - cat_result = torch.cat(out_list, dim=0) - if op_type == "expand": - out_tensor += cat_result - else: - out_tensor.copy_(cat_result) - return - - -def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, - op_type, device): - seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), - dim=0, - ).to(device) - total_tokens = seq_len_tensor.sum() - if op_type == "shrink": - inputs_tensor = torch.rand((total_tokens, hidden_size), - dtype=dtype).to(device) - lora_weights = torch.rand( - (lora_nums, max_rank, hidden_size), # col-major - dtype=dtype, - ).to(device) - # shrink op need atomic_add, so output is initinized by 0 - ref_out_tensor = torch.zeros((total_tokens, max_rank), - dtype=dtype, - device=inputs_tensor.device) - # NOTE shrink kernel using torch.float32 as output type - our_out_tensor = torch.zeros( - (total_tokens, max_rank), - dtype=torch.float32, - device=inputs_tensor.device, - ) - else: 
- - inputs_tensor = torch.rand( - (total_tokens, max_rank), - dtype=dtype, - ).to(device) - lora_weights = torch.rand( - (lora_nums, hidden_size, max_rank), # col-major - dtype=dtype, - ).to(device) - # expand op needs to complete y+=a@lora_b, so output is - # initinized randomly - ref_out_tensor = torch.rand( - (total_tokens, hidden_size), - dtype=dtype, - device=inputs_tensor.device, - ) - # Ensure the same input. - our_out_tensor = ref_out_tensor.clone() - - lora_indices_tensor = torch.randint(0, - lora_nums - 1 if lora_nums > 1 else 1, - (batchs, )).to(device) - indices = torch.zeros((total_tokens), dtype=torch.long).to(device) - current_offset = 0 - for b_id in range(batchs): - lora_index = lora_indices_tensor[b_id] - indices[current_offset:current_offset + - seq_len_tensor[b_id]] = lora_index.item() - current_offset += seq_len_tensor[b_id].item() - return ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) - - -@pytest.mark.parametrize("batchs", BATCHS) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", OP_TYPES) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sgmv_torch( - batchs: int, - num_loras: int, - rank: int, - scaling: float, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - torch.manual_seed(seed) - if batchs == 0: - batchs += 1 - hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) - hidden_size = HIDDEN_SIZES[hidden_size_index] - if hidden_size > 100000: - hidden_size = hidden_size // 4 # avoid OOM - ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = _generate_data( - batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, - device) # The sequence length is restricted to the range [1, 1024]. - max_seq_length = seq_len_tensor.max() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - if op_type == "shrink": - sgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - scaling, - ) - else: - sgmv_expand(inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - add_inputs=True) - _torch_groupgemm(ref_out_tensor, inputs_tensor, lora_weights, - lora_indices_tensor, seq_len_tensor, batchs, - scaling if op_type == "shrink" else 1.0, op_type) - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) - assert_close(our_out_tensor, ref_out_tensor) - - -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", OP_TYPES) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sgmv_punica_bgmv( - hidden_size, - scaling: float, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error - if dtype == torch.float32 or hidden_size == 3424: - return - torch.manual_seed(seed) - batchs = 4 # Arbitrary values for testing - rank = 16 - seq_len = 333 # Arbitrary values for testing - num_loras = 8 # Arbitrary values for testing - ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, - op_type, device) - - max_seq_length = seq_len_tensor.max() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - if op_type == "shrink": - sgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - scaling, - ) - else: - sgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - add_inputs=True, - ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - _punica_bgmv( - ref_out_tensor, - inputs_tensor, - lora_weights_4d, - indices, - scaling if op_type == "shrink" else 1.0, - ) - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) - assert_close(our_out_tensor, ref_out_tensor) - - -@pytest.mark.skip("TODO") -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sgmv_expand_nslice( - hidden_size, - scaling: float, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error - if dtype == torch.float32 or hidden_size == 3424: - return - torch.manual_seed(seed) - batchs = 4 # Arbitrary values for testing - rank = 16 - seq_len = 333 # Arbitrary values for testing - num_loras = 8 # Arbitrary values for testing - ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, - op_type, device) - - max_seq_length = seq_len_tensor.max() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - - sgmv_expand_slice( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - 1024, - add_inputs=True, - ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - _punica_bgmv( - ref_out_tensor, - inputs_tensor, - lora_weights_4d, - indices, - scaling if op_type == "shrink" else 1.0, - ) - - assert_close(our_out_tensor, ref_out_tensor) - - -if __name__ == "__main__": - pytest.main(["test_triton_sgmv.py::test_sgmv_expand_nslice"]) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index aaa8a66c40ab..a3e52a749fb6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -386,7 +386,7 @@ def _prepare_model_input( if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - lora_index_mapping += [lora_id] * (seq_len - context_len) + lora_index_mapping += [lora_id] * query_len batch_lora_index_mapping += [lora_id if lora_id > 0 else -1] lora_prompt_mapping.extend( [lora_id] * From e0cb42b726c5dc06f4222f676ca301100e15ffbf Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 4 Jun 2024 15:18:25 +0800 Subject: [PATCH 11/71] optimize bgmv_shrink --- vllm/lora/ops/bgmv_shrink.py | 19 ++++++++++++------ vllm/lora/ops/sgmv_shrink.py | 39 +++++++++++++++++------------------- 2 files changed, 31 insertions(+), 27 deletions(-) diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index ac61c9d50bda..ed208796633a 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -28,24 +28,26 @@ def _bgmv_shrink_kernel( cn_stride, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + SPLIT_K: tl.constexpr, ): - cur_batch = tl.program_id(axis=0) + pid_sk = tl.program_id(axis=0) + cur_batch = tl.program_id(axis=1) lora_index = tl.load(lora_indices + cur_batch) if lora_index == -1: return offset_n = tl.arange(0, BLOCK_N) - offset_k = tl.arange(0, BLOCK_K) + offset_k = tl.arange(0, BLOCK_K) + pid_sk * BLOCK_K a_ptr = input_ptr + cur_batch * xm_stride b_ptr = lora_ptr + l0_stride * lora_index rank_mask = offset_n[:, None] < N accumulator = tl.zeros((BLOCK_N,), dtype=tl.float32) - for k in range(0, K, BLOCK_K): + for k in range(0, K, BLOCK_K * SPLIT_K): current_k = k + offset_k # vector load current_k_c = tl.max_contiguous(current_k, BLOCK_K) tiled_a = tl.load( - a_ptr + current_k_c * xk_stride, + a_ptr + current_k_c, mask=current_k < K, other=0.0, ) # [BLOCK_K] @@ -64,8 +66,10 @@ def _bgmv_shrink_kernel( offset_cn = tl.arange(0, BLOCK_N) c_ptr = out_ptr + cur_batch * cm_stride + offset_cn * cn_stride c_mask = offset_cn < N - - tl.store(c_ptr, accumulator, mask=c_mask) + if SPLIT_K == 1: + tl.store(c_ptr, accumulator, mask=c_mask) + else: + tl.atomic_add(c_ptr, accumulator, mask=c_mask) @torch.inference_mode() @@ -105,7 +109,9 @@ def bgmv_shrink( N, K = lora_a_weights.shape[-2:] # 
K=hidden_size,N=rank BLOCK_K = 512 BLOCK_N = triton.next_power_of_2(output_tensor.size(1)) + SPLIT_K = 16 grid = [ + SPLIT_K, batchs, ] config = {"num_stages": 4, "num_warps": 8} @@ -126,6 +132,7 @@ def bgmv_shrink( output_tensor.stride(1), BLOCK_N, BLOCK_K, + SPLIT_K, **config, ) return diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 65bf1a6a5d47..d27bcd15880b 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -120,7 +120,6 @@ def sgmv_shrink( batchs: int, max_seq_length: int, scaling: float, - config: dict, ): """ @@ -163,20 +162,20 @@ def sgmv_shrink( # SPLIT_K = config.get("SPLIT_K", 16) # num_warps = config.get("num_warps", 4) # num_stages = config.get("num_stages", 3) - # BLOCK_M = 32 - # BLOCK_N = 16 - # BLOCK_K = 32 - # SPLIT_K = 16 - EVEN_K = K % config.get("BLOCK_K", 32) == 0 - # grid = [ - # triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), - # SPLIT_K, - # batchs, - # ] + BLOCK_M = 32 + BLOCK_N = 16 + BLOCK_K = 32 + SPLIT_K = 8 + EVEN_K = K % BLOCK_K == 0 + grid = [ + triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), + SPLIT_K, + batchs, + ] - grid = lambda META: (triton.cdiv(max_seq_length, META[ - 'BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),META[ - 'SPLIT_K'],batchs) + # grid = lambda META: (triton.cdiv(max_seq_length, META[ + # 'BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),META[ + # 'SPLIT_K'],batchs) _sgmv_shrink_kernel[grid]( inputs, lora_a_weights, @@ -194,12 +193,10 @@ def sgmv_shrink( lora_a_weights.stride(2), output_tensor.stride(0), output_tensor.stride(1), - EVEN_K=EVEN_K, - **config - # BLOCK_M, - # BLOCK_N, - # BLOCK_K, - # EVEN_K, - # SPLIT_K, + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + SPLIT_K, ) return From 64416e071a5e12b5aa7170b28d277324f333517f Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 4 Jun 2024 18:44:45 +0800 Subject: [PATCH 12/71] optimize bgmv_expand --- vllm/lora/ops/bgmv_expand.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 7762276b65ce..aa572151fb41 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -27,34 +27,42 @@ def _bgmv_expand_kernel( cn_stride, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + SPLIT_N: tl.constexpr, ADD_INPUTS: tl.constexpr, CAST_TYPE: tl.constexpr, ): """ C=A@B, and B is col-major matrix """ - cur_batch = tl.program_id(axis=0) + pid_sn = tl.program_id(axis=0) + cur_batch = tl.program_id(axis=1) lora_index = tl.load(lora_indices + cur_batch) if lora_index == -1: return offset_k = tl.arange(0, BLOCK_K) offset_n = tl.arange(0, BLOCK_N) - # tl.max_contiguous(offset_k, BLOCK_K) tiled_a = tl.load( input_ptr + cur_batch * xm_stride + offset_k * xk_stride, mask=offset_k < K, other=0, ) # [BLOCK_K] - b_ptr = lora_ptr + l0_stride * lora_index + + split_n_length = tl.cdiv(N, SPLIT_N) if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # sliding to next row-block - - for n in range(0, N, BLOCK_N): + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + pid_sn * split_n_length * lora_k_stride + ) + for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n # vector load current_n_c = tl.max_contiguous(current_n, BLOCK_N) - b_ptr_mask = (current_n[:, None] < N) & (offset_k[None, :] < K) + b_ptr_mask = (current_n[:, None] < split_n_length) & ( + offset_k[None, :] < K + ) tiled_b = tl.load( b_ptr @@ -66,8 +74,13 @@ def _bgmv_expand_kernel( accumulator = tl.sum(tiled_a * 
tiled_b, 1) - c_ptr = out_ptr + cur_batch * cm_stride + current_n * cn_stride - c_mask = current_n < N + c_ptr = ( + out_ptr + + cur_batch * cm_stride + + pid_sn * split_n_length + + current_n * cn_stride + ) + c_mask = current_n < split_n_length if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) accumulator += tiled_out @@ -118,8 +131,9 @@ def bgmv_expand( # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - BLOCK_N = 512 + BLOCK_N = 512 BLOCK_K = triton.next_power_of_2(K) + SPLIT_N = 8 ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ @@ -128,9 +142,9 @@ def bgmv_expand( ]: CAST_TYPE = True grid = [ + SPLIT_N, batchs, ] - config = {"num_stages": 4, "num_warps": 8} _bgmv_expand_kernel[grid]( inputs, lora_b_weights, @@ -147,8 +161,8 @@ def bgmv_expand( output_tensor.stride(1), BLOCK_N, BLOCK_K, + SPLIT_N, ADD_INPUTS, CAST_TYPE, - **config, ) return From 891df631f3d4d18243c117c7e63183b12f2825fe Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 5 Jun 2024 00:35:01 +0800 Subject: [PATCH 13/71] add bgmv --- tests/lora/test_triton_punica.py | 56 +++++++--------- vllm/lora/layers.py | 15 +++-- vllm/lora/models.py | 22 +++--- vllm/lora/ops/bgmv_expand.py | 29 +++----- vllm/lora/ops/bgmv_expand_slice.py | 56 +++++++--------- vllm/lora/ops/bgmv_shrink.py | 7 +- vllm/lora/ops/sgmv_expand_slice.py | 2 +- vllm/lora/ops/sgmv_shrink.py | 42 +++++------- vllm/lora/punica.py | 103 ++++++++++++++++++++++++++++- 9 files changed, 200 insertions(+), 132 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 74bab70f1aad..6aea2573d962 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -62,7 +62,7 @@ 128000, 128256, ] -BATCHS = [i for i in range(0, 64, 8)] +BATCHS = [i for i in range(0, 128, 8)] NUM_LORA = [1, 4, 8, 16, 32, 64, 128, 256] DTYPES = [torch.half, torch.bfloat16, torch.float32] MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] @@ -101,7 +101,7 @@ def _torch_groupgemm( out_list = [] current_offset = 0 for lora_index, b_length in zip(range(batchs), seq_len_tensor): - input_weight = inputs[current_offset : b_length + current_offset, :] + input_weight = inputs[current_offset:b_length + current_offset, :] current_offset += b_length lora_weight = lora_weights[lora_indices_tensor[lora_index]] result = torch.nn.functional.linear(input_weight, lora_weight) @@ -115,29 +115,27 @@ def _torch_groupgemm( return -def _generate_data( - batchs, hidden_size, lora_nums, max_rank, max_length, dtype, op_type, device -): +def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, + op_type, device): if max_length == 1: max_length += 1 - seq_len_tensor = torch.randint(1, max_length, (batchs,)).to(device) + seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, ).to(device) total_tokens = seq_len_tensor.sum() if op_type == "shrink": - inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to( - device - ) + inputs_tensor = torch.rand((total_tokens, hidden_size), + dtype=dtype).to(device) lora_weights = torch.rand( (lora_nums, max_rank, hidden_size), # col-major dtype=dtype, ).to(device) # shrink op need atomic_add, so output is initinized by 0 - ref_out_tensor = torch.zeros( - (total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device - ) + ref_out_tensor = torch.zeros((total_tokens, max_rank), + 
dtype=dtype, + device=inputs_tensor.device) # NOTE shrink kernel using torch.float32 as output type our_out_tensor = torch.zeros( (total_tokens, max_rank), @@ -163,16 +161,15 @@ def _generate_data( # Ensure the same input. our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint( - 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs,) - ).to(device) + lora_indices_tensor = torch.randint(0, + lora_nums - 1 if lora_nums > 1 else 1, + (batchs, )).to(device) indices = torch.zeros((total_tokens), dtype=torch.long).to(device) current_offset = 0 for b_id in range(batchs): lora_index = lora_indices_tensor[b_id] - indices[ - current_offset : current_offset + seq_len_tensor[b_id] - ] = lora_index.item() + indices[current_offset:current_offset + + seq_len_tensor[b_id]] = lora_index.item() current_offset += seq_len_tensor[b_id].item() return ( inputs_tensor, @@ -186,7 +183,7 @@ def _generate_data( ) -@pytest.mark.skip("work in progress") +# @pytest.mark.skip("work in progress") @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) @@ -222,8 +219,8 @@ def test_sgmv_torch( seq_len_tensor, indices, ) = _generate_data( - batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, device - ) # The sequence length is restricted to the range [1, 1024]. + batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, + device) # The sequence length is restricted to the range [1, 1024]. max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): max_seq_length = max_seq_length[0].item() @@ -268,7 +265,7 @@ def test_sgmv_torch( assert_close(our_out_tensor, ref_out_tensor) -@pytest.mark.skip("work in progress") +# @pytest.mark.skip("work in progress") @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) @@ -300,9 +297,8 @@ def test_triton_sgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data( - batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device - ) + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): @@ -380,9 +376,8 @@ def test_triton_bgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data( - batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device - ) + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) if op_type == "shrink": bgmv_shrink( @@ -446,9 +441,8 @@ def test_sgmv_expand_nslice( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data( - batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device - ) + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index a3a40ad0bd24..ba7f52ff2fb1 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -126,10 +126,11 @@ def _apply_lora_triton( batch_size = batch_mlen_stage_lst[0] max_length = batch_mlen_stage_lst[1] + is_prefilling = bool(batch_mlen_stage_lst[2]) add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, b_seq_start_tensor, seq_length_tensor, lora_index_tensor, - batch_size, max_length, 0, 1.0) + batch_size, max_length, 0, 1.0, is_prefilling) return output.view_as(org_output) @@ -206,7 +207,7 @@ def 
_apply_lora_triton_nslice( batch_size = batch_mlen_stage_lst[0] max_length = batch_mlen_stage_lst[1] - + is_prefilling = bool(batch_mlen_stage_lst[2]) offset_left = 0 #TODO fuse these kernel for slice_idx in range(len(output_slices)): @@ -214,7 +215,7 @@ def _apply_lora_triton_nslice( lora_b_stacked[slice_idx], b_seq_start_tensor, seq_length_tensor, lora_index_tensor, batch_size, max_length, 0, 1.0, offset_left, - output_slices[slice_idx]) + output_slices[slice_idx], is_prefilling) offset_left += output_slices[slice_idx] return output.view_as(org_output) @@ -558,8 +559,8 @@ def apply(self, x: torch.Tensor, _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, self.b_seq_start_tensor[:batch_size], self.seq_length_tensor[:batch_size], - self.indices[:batch_size], self.batch_mlen_stage_lst, - output) + self.indices[:batch_size], + self.batch_mlen_stage_lst, output) return output def forward(self, input_): @@ -1125,8 +1126,8 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, self.b_seq_start_tensor[:batch_size], self.seq_length_tensor[:batch_size], - self.indices[:batch_size], self.batch_mlen_stage_lst, - output) + self.indices[:batch_size], + self.batch_mlen_stage_lst, output) return output # def apply(self, x: torch.Tensor) -> torch.Tensor: diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 1cdc3a03b8bf..438eeff1ff0c 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -24,7 +24,7 @@ # _BATCH_SIZES_TO_CAPTURE.It needs to be updated if _BATCH_SIZES_TO_CAPTURE # is changed. -_MAX_BATCHS = 256+16 #max(_BATCH_SIZES_TO_CAPTURE)+16 +_MAX_BATCHS = 256 + 16 #max(_BATCH_SIZES_TO_CAPTURE)+16 logger = init_logger(__name__) @@ -448,10 +448,10 @@ def __init__( dtype=torch.long, device="cuda") - # element contains batch_size, max_length, 0 or 1. Use 1 for the - # prefilling stage and 0 for the decoding stage.The reason for - # distinguishing between the prefilling and decoding stage is that - # if we have implemented bgmv, it can be utilized during the decoding + # element contains batch_size, max_length, 0 or 1. Use 1 for the + # prefilling stage and 0 for the decoding stage.The reason for + # distinguishing between the prefilling and decoding stage is that + # if we have implemented bgmv, it can be utilized during the decoding # stage. 
self.batch_mlen_stage_lst = [-1] * 3 self._create_lora_modules() @@ -582,12 +582,12 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: dtype=torch.long, device="cuda") self.seq_length_tensor[:batchs].copy_(seq_length_tensor) - temp_tensor=torch.cumsum( - seq_length_tensor, - dim=0, - dtype=seq_length_tensor.dtype) - self.b_seq_start_tensor[1:temp_tensor.size(0)+1].copy_(temp_tensor) - + temp_tensor = torch.cumsum(seq_length_tensor, + dim=0, + dtype=seq_length_tensor.dtype) + self.b_seq_start_tensor[1:temp_tensor.size(0) + + 1].copy_(temp_tensor) + self.batch_mlen_stage_lst[:] = [ batchs, max(mapping.seq_lens), 1 if mapping.is_prefilling else 0 diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index aa572151fb41..6132b6047997 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -51,35 +51,26 @@ def _bgmv_expand_kernel( if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # sliding to next row-block - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + pid_sn * split_n_length * lora_k_stride - ) + b_ptr = (lora_ptr + l0_stride * lora_index + + pid_sn * split_n_length * lora_k_stride) for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n # vector load current_n_c = tl.max_contiguous(current_n, BLOCK_N) - b_ptr_mask = (current_n[:, None] < split_n_length) & ( - offset_k[None, :] < K - ) + b_ptr_mask = (current_n[:, None] < split_n_length) & (offset_k[None, :] + < K) tiled_b = tl.load( - b_ptr - + current_n_c[:, None] * lora_k_stride - + offset_k[None, :] * lora_n_stride, + b_ptr + current_n_c[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, mask=b_ptr_mask, other=0.0, ) # [BLOCK_N,BLOCK_K] accumulator = tl.sum(tiled_a * tiled_b, 1) - c_ptr = ( - out_ptr - + cur_batch * cm_stride - + pid_sn * split_n_length - + current_n * cn_stride - ) + c_ptr = (out_ptr + cur_batch * cm_stride + pid_sn * split_n_length + + current_n * cn_stride) c_mask = current_n < split_n_length if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) @@ -137,8 +128,8 @@ def bgmv_expand( ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index a197f5eddb8b..63dc3cabb5a9 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -28,52 +28,51 @@ def _bgmv_expand_slice_kernel( slice_offset, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + SPLIT_N: tl.constexpr, ADD_INPUTS: tl.constexpr, CAST_TYPE: tl.constexpr, ): """ C=A@B, and B is col-major matrix """ - cur_batch = tl.program_id(axis=0) + pid_sn = tl.program_id(axis=0) + cur_batch = tl.program_id(axis=1) lora_index = tl.load(lora_indices + cur_batch) if lora_index == -1: return offset_k = tl.arange(0, BLOCK_K) offset_n = tl.arange(0, BLOCK_N) - # tl.max_contiguous(offset_k, BLOCK_K) tiled_a = tl.load( input_ptr + cur_batch * xm_stride + offset_k * xk_stride, mask=offset_k < K, other=0, ) # [BLOCK_K] - b_ptr = lora_ptr + l0_stride * lora_index + + split_n_length = tl.cdiv(N, SPLIT_N) if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # sliding to next row-block - - for n in range(0, N, BLOCK_N): + b_ptr = (lora_ptr + l0_stride * lora_index + + pid_sn * split_n_length * lora_k_stride) + for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n # vector load current_n_c = 
tl.max_contiguous(current_n, BLOCK_N) - b_ptr_mask = (current_n[:, None] < N) & (offset_k[None, :] < K) + b_ptr_mask = (current_n[:, None] < split_n_length) & (offset_k[None, :] + < K) tiled_b = tl.load( - b_ptr - + current_n_c[:, None] * lora_k_stride - + offset_k[None, :] * lora_n_stride, + b_ptr + current_n_c[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, mask=b_ptr_mask, other=0.0, ) # [BLOCK_N,BLOCK_K] accumulator = tl.sum(tiled_a * tiled_b, 1) - c_ptr = ( - out_ptr - + cur_batch * cm_stride - + slice_offset # slice size - + current_n * cn_stride - ) - c_mask = current_n < N + c_ptr = (out_ptr + cur_batch * cm_stride + pid_sn * split_n_length + + slice_offset * cn_stride +current_n * cn_stride) + c_mask = current_n < split_n_length if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) accumulator += tiled_out @@ -86,32 +85,23 @@ def bgmv_expand_slice( lora_b_weights: torch.Tensor, output_tensor: torch.Tensor, lora_indices_tensor: torch.Tensor, - batchs: int, - max_seq_length: int, slice_offset: int, slice_size: int, + batchs: int, add_inputs: bool = False, ): - """_summary_ - + """ Args: inputs (torch.Tensor): input tensor lora_b_weights (torch.Tensor): lora'a weight output_tensor (torch.Tensor): output tensor - b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative - sequence lengths of the sequences in the batch, used to index - into sequence. E.g.,if the sequence length is [4, 6], it is - [0, 4, 10]. - seq_len_tensor (torch.Tensor): (batch_size,). record the sequence - length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch batchs (int): batch size - max_seq_length (int): The max sequence lengths of the sequences - in the batch slice_offst (int): output_tensor's offst slice_size (int): current output_tensor's size add_inputs (bool, optional): _description_. Defaults to False. + cast_type (bool, optional): _description_. Defaults to False. 
""" assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] @@ -134,18 +124,21 @@ def bgmv_expand_slice( assert lora_b_weights.is_contiguous() - N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size # TODO tuning this config + + N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size BLOCK_N = 512 BLOCK_K = triton.next_power_of_2(K) + SPLIT_N = 8 ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ + SPLIT_N, batchs, ] _bgmv_expand_slice_kernel[grid]( @@ -165,6 +158,7 @@ def bgmv_expand_slice( slice_offset, BLOCK_N, BLOCK_K, + SPLIT_N, ADD_INPUTS, CAST_TYPE, ) diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index ed208796633a..5495e6f54353 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -41,7 +41,7 @@ def _bgmv_shrink_kernel( a_ptr = input_ptr + cur_batch * xm_stride b_ptr = lora_ptr + l0_stride * lora_index rank_mask = offset_n[:, None] < N - accumulator = tl.zeros((BLOCK_N,), dtype=tl.float32) + accumulator = tl.zeros((BLOCK_N, ), dtype=tl.float32) for k in range(0, K, BLOCK_K * SPLIT_K): current_k = k + offset_k # vector load @@ -54,9 +54,8 @@ def _bgmv_shrink_kernel( b_ptr_mask = (rank_mask < N) & (current_k[None, :] < K) tiled_b = tl.load( - b_ptr - + offset_n[:, None] * lora_k_stride - + current_k[None, :] * lora_n_stride, + b_ptr + offset_n[:, None] * lora_k_stride + + current_k[None, :] * lora_n_stride, mask=b_ptr_mask, other=0.0, ) # [BLOCK_N,BLOCK_K] diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 41e65d2a15d4..72ed81bcbbd3 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -60,7 +60,7 @@ def _sgmv_expand_slice_kernel( offset_k = tl.arange(0, BLOCK_K) ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + offset_k[None, :] * xk_stride, ) b_ptr = (lora_ptr + l0_stride * lora_index + diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index d27bcd15880b..3dd48a8bafac 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -60,18 +60,10 @@ def _sgmv_shrink_kernel( ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - a_ptr = ( - input_ptr - + cur_seq_start * xm_stride - + ram[:, None] * xm_stride - + offset_k[None, :] * xk_stride - ) - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + rbn[None, :] * lora_k_stride - + offset_k[:, None] * lora_n_stride - ) + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + + offset_k[None, :] * xk_stride) + b_ptr = (lora_ptr + l0_stride * lora_index + rbn[None, :] * lora_k_stride + + offset_k[:, None] * lora_n_stride) accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): @@ -80,12 +72,12 @@ def _sgmv_shrink_kernel( tiled_b = tl.load(b_ptr) else: k_remaining = K - k * (BLOCK_K * SPLIT_K) - tiled_a = tl.load( - a_ptr, mask=offset_k[None, :] < k_remaining, other=0.0 - ) - tiled_b = tl.load( - b_ptr, mask=offset_k[:, None] < k_remaining, other=0.0 - ) + tiled_a = tl.load(a_ptr, + mask=offset_k[None, :] < k_remaining, + other=0.0) + tiled_b = tl.load(b_ptr, + 
mask=offset_k[:, None] < k_remaining, + other=0.0) accumulator += tl.dot(tiled_a, tiled_b) a_ptr += BLOCK_K * SPLIT_K * xk_stride @@ -93,14 +85,10 @@ def _sgmv_shrink_kernel( offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N - c_ptr = ( - out_ptr - + offset_cm[:, None] * cm_stride - + offset_cn[None, :] * cn_stride - ) - c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & ( - offset_cn[None, :] < N - ) + c_ptr = (out_ptr + offset_cm[:, None] * cm_stride + + offset_cn[None, :] * cn_stride) + c_mask = (offset_cm[:, None] < + (cur_seq_start + M)) & (offset_cn[None, :] < N) accumulator *= scaling # handles write-back with reduction-splitting if SPLIT_K == 1: @@ -172,7 +160,7 @@ def sgmv_shrink( SPLIT_K, batchs, ] - + # grid = lambda META: (triton.cdiv(max_seq_length, META[ # 'BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),META[ # 'SPLIT_K'],batchs) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 4f4fccca8051..f3ebc29ecfea 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -6,7 +6,10 @@ from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.lora.ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.bgmv_shrink import bgmv_shrink from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice +from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice def _raise_import_error(e): @@ -164,6 +167,7 @@ def add_lora_triton( max_length: int, layer_idx: int, scale: float, + is_prefilling: bool, *, buffer: Optional[torch.Tensor] = None, ): @@ -175,6 +179,49 @@ def add_lora_triton( buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) + if is_prefilling: + _lora_sgmv( + y, + x, + wa_t_all, + wb_t_all, + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + layer_idx, + scale, + buffer=buffer, + ) + else: + _lora_bgmv( + y, + x, + wa_t_all, + wb_t_all, + lora_indices_tensor, + batch_size, + layer_idx, + scale, + buffer=buffer, + ) + + +def _lora_sgmv( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + seq_length_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batch_size: int, + max_length: int, + layer_idx: int, + scale: float, + buffer: torch.Tensor, +): sgmv_shrink( x, wa_t_all, @@ -199,6 +246,26 @@ def add_lora_triton( ) +def _lora_bgmv( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batch_size: int, + layer_idx: int, + scale: float, + buffer: torch.Tensor, +): + bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, batch_size, scale) + bgmv_expand(buffer, + wb_t_all, + y, + lora_indices_tensor, + batch_size, + add_inputs=True) + + def add_lora_slice( y: torch.Tensor, x: torch.Tensor, @@ -288,6 +355,7 @@ def add_lora_triton_slice( scale: float, y_offset: int, y_slice_size: int, + is_prefilling: bool, *, buffer: Optional[torch.Tensor] = None, ): @@ -315,7 +383,7 @@ def add_lora_triton_slice( scale: Scaling factor. y_offset: Offset to apply to the starting column of y. y_slice_size: Size of the y column slice. 
- # """ + #""" # try: # import vllm._punica_C as punica_kernels # except ImportError as e: @@ -329,6 +397,23 @@ def add_lora_triton_slice( buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) + if is_prefilling: + _lora_sgmv_nslice(y, x, wa_t_all, wb_t_all, b_seq_start_tensor, + seq_length_tensor, lora_indices_tensor, batch_size, + max_length, layer_idx, scale, y_offset, y_slice_size, + buffer) + else: + _lora_bgmv_nslice(y, x, wa_t_all, wb_t_all, lora_indices_tensor, + batch_size, layer_idx, scale, y_offset, y_slice_size, + buffer) + + +def _lora_sgmv_nslice(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, b_seq_start_tensor: torch.Tensor, + seq_length_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, batch_size: int, + max_length: int, layer_idx: int, scale: float, + y_offset: int, y_slice_size: int, buffer): sgmv_shrink( x, wa_t_all, @@ -353,3 +438,19 @@ def add_lora_triton_slice( y_slice_size, add_inputs=True, ) + + +def _lora_bgmv_nslice(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, batch_size: int, + layer_idx: int, scale: float, y_offset: int, + y_slice_size: int, buffer): + bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, batch_size, scale) + bgmv_expand_slice(buffer, + wb_t_all, + y, + lora_indices_tensor, + y_offset, + y_slice_size, + batch_size, + add_inputs=True) From ab85bb54f30e786901a81c555e43b13293e62700 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 5 Jun 2024 00:58:36 +0800 Subject: [PATCH 14/71] add bgmv --- vllm/lora/layers.py | 69 ++++++++++++++---------------- vllm/lora/ops/bgmv_expand_slice.py | 2 +- 2 files changed, 32 insertions(+), 39 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index ba7f52ff2fb1..b2605bf96b21 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -127,10 +127,12 @@ def _apply_lora_triton( batch_size = batch_mlen_stage_lst[0] max_length = batch_mlen_stage_lst[1] is_prefilling = bool(batch_mlen_stage_lst[2]) - + # maybe we need not restrict range to [:batch_size] add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, - b_seq_start_tensor, seq_length_tensor, lora_index_tensor, - batch_size, max_length, 0, 1.0, is_prefilling) + b_seq_start_tensor[:batch_size], + seq_length_tensor[:batch_size], + lora_index_tensor[:batch_size], batch_size, max_length, 0, + 1.0, is_prefilling) return output.view_as(org_output) @@ -211,11 +213,11 @@ def _apply_lora_triton_nslice( offset_left = 0 #TODO fuse these kernel for slice_idx in range(len(output_slices)): - add_lora_triton_slice(output, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], b_seq_start_tensor, - seq_length_tensor, lora_index_tensor, batch_size, - max_length, 0, 1.0, offset_left, - output_slices[slice_idx], is_prefilling) + add_lora_triton_slice( + output, x, lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], + b_seq_start_tensor[:batch_size], seq_length_tensor[:batch_size], + lora_index_tensor[:batch_size], batch_size, max_length, 0, 1.0, + offset_left, output_slices[slice_idx], is_prefilling) offset_left += output_slices[slice_idx] return output.view_as(org_output) @@ -554,13 +556,9 @@ def set_mapping( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - batch_size = self.batch_mlen_stage_lst[0] - # maybe we need not restrict range to [:batch_size] _apply_lora_triton(x, self.lora_a_stacked, 
self.lora_b_stacked, - self.b_seq_start_tensor[:batch_size], - self.seq_length_tensor[:batch_size], - self.indices[:batch_size], - self.batch_mlen_stage_lst, output) + self.b_seq_start_tensor, self.seq_length_tensor, + self.indices, self.batch_mlen_stage_lst, output) return output def forward(self, input_): @@ -722,14 +720,13 @@ def apply(self, x: torch.Tensor, # output, # (self.output_dim, self.output_dim), # ) - batch_size = self.batch_mlen_stage_lst[0] _apply_lora_triton_nslice( x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor[:batch_size], - self.seq_length_tensor[:batch_size], - self.indices[:batch_size], + self.b_seq_start_tensor, + self.seq_length_tensor, + self.indices, self.batch_mlen_stage_lst, output, (self.output_dim, self.output_dim), @@ -998,14 +995,13 @@ def apply(self, x: torch.Tensor, # output, # self.output_slices, # ) - batch_size = self.batch_mlen_stage_lst[0] _apply_lora_triton_nslice( x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor[:batch_size], - self.seq_length_tensor[:batch_size], - self.indices[:batch_size], + self.b_seq_start_tensor, + self.seq_length_tensor, + self.indices, self.batch_mlen_stage_lst, output, self.output_slices, @@ -1121,13 +1117,10 @@ def set_mapping( def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) - batch_size = self.batch_mlen_stage_lst[0] # maybe we need not restrict range to [:batch_size] _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor[:batch_size], - self.seq_length_tensor[:batch_size], - self.indices[:batch_size], - self.batch_mlen_stage_lst, output) + self.b_seq_start_tensor, self.seq_length_tensor, + self.indices, self.batch_mlen_stage_lst, output) return output # def apply(self, x: torch.Tensor) -> torch.Tensor: @@ -1373,17 +1366,17 @@ def _get_logits( logits, ) - # batch_size=self.batch_mlen_stage_lst[0] - # _apply_lora_triton(hidden_states, self.lora_a_stacked, self.lora_b_stacked, - # self.b_seq_start_tensor[:batch_size], - # self.seq_length_tensor[:batch_size], - # self.indices[:self.indices_len[1]], - # self.batch_mlen_stage_lst, logits_temp) - # flag=torch.allclose(logits_temp,logits,rtol=1e-2,atol=1e-2) - # if flag: - # print("pass") - # else: - # print("error") + logits_temp = logits.clone() + _apply_lora_triton(hidden_states, self.lora_a_stacked, + self.lora_b_stacked, self.b_seq_start_tensor, + self.seq_length_tensor, + self.indices[:self.indices_len[1]], + self.batch_mlen_stage_lst, logits_temp) + flag = torch.allclose(logits_temp, logits, rtol=1e-2, atol=1e-2) + if flag: + print("pass") + else: + print("error") # Remove paddings in vocab (if any). 
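# Illustration only (the values below are made up, not taken from this patch):
# _apply_lora_triton assumes the metadata layout
#   batch_mlen_stage_lst = [batch_size, max_seq_length, stage]
# where stage is 1 for prefilling (sgmv kernels) and 0 for decoding (bgmv
# kernels); it reads the flag with bool(batch_mlen_stage_lst[2]).
batch_mlen_stage_lst = [8, 128, 1]   # e.g. a prefill batch of 8, longest seq 128
decode_meta = batch_mlen_stage_lst.copy()
decode_meta[2] = 0                   # add_lora_triton would dispatch the bgmv path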
logits = logits[:, :self.base_layer.vocab_size] diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 63dc3cabb5a9..5e30312c7e18 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -71,7 +71,7 @@ def _bgmv_expand_slice_kernel( accumulator = tl.sum(tiled_a * tiled_b, 1) c_ptr = (out_ptr + cur_batch * cm_stride + pid_sn * split_n_length + - slice_offset * cn_stride +current_n * cn_stride) + slice_offset * cn_stride + current_n * cn_stride) c_mask = current_n < split_n_length if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) From f99b3d27ae454a26caa61a943900ae682cfcf4cb Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 5 Jun 2024 09:33:31 +0800 Subject: [PATCH 15/71] repalcing punica completed --- vllm/lora/layers.py | 143 +-------------- vllm/lora/punica.py | 437 ++++++++++++++++++++++---------------------- 2 files changed, 225 insertions(+), 355 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index b2605bf96b21..ff922a14d879 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -16,8 +16,8 @@ tensor_model_parallel_all_reduce, tensor_model_parallel_gather) from vllm.distributed.utils import divide -from vllm.lora.punica import (add_lora, add_lora_triton, add_lora_slice, - add_lora_triton_slice, bgmv) +from vllm.lora.punica import (add_lora_triton, + add_lora_triton_slice) from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, @@ -63,38 +63,6 @@ def dec(*args, **kwargs): return dec - -def _apply_lora( - x: torch.Tensor, - lora_a_stacked: torch.Tensor, - lora_b_stacked: torch.Tensor, - indices: torch.Tensor, - output: torch.Tensor, -): - """Applies lora to each input. - - This method applies all loras to each input. It uses the - indices vector to determine which lora yields the - correct output. An index of -1 means no lora should be - applied. This method adds the final lora results to the - output. - - Input shapes: - x: (batch_size, hidden_dim) - lora_a_stacked: (num_loras, lora_rank, hidden_dim) - lora_b_stacked: (num_loras, output_dim, lora_rank) - indices: (batch_size) - output: (batch_size, output_dim) - """ - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - buffer = add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, - 1.0) - return buffer, output.view_as(org_output) - - def _apply_lora_triton( x: torch.Tensor, lora_a_stacked: torch.Tensor, @@ -136,47 +104,6 @@ def _apply_lora_triton( return output.view_as(org_output) -def _apply_lora_packed_nslice( - x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - indices: torch.Tensor, - output: torch.Tensor, - output_slices: Tuple[int, ...], -): - """Applies lora to each input. - - This method applies all loras to each input. It uses the - indices vector to determine which lora yields the - correct output. An index of -1 means no lora should be - applied. This method adds the final lora results to the - output. - - This method is used for layers that are composed of multiple sublayers - (slices) packed together. 
- - Input shapes: - x: (batch_size, hidden_dim) - lora_a_stacked: 3 element tuple of (num_loras, lora_rank, hidden_dim) - lora_b_stacked: 3 element tuple of (num_loras, output_dim, lora_rank) - indices: (batch_size) - output: (batch_size, q_slice_size + 2*kv_slice_size) - output_slices: n-1 element tuple of (slice_size...), - where n is number of slices - """ - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - offset_left = 0 - for slice_idx in range(len(output_slices)): - add_lora_slice(output, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], indices, 0, 1.0, offset_left, - output_slices[slice_idx]) - offset_left += output_slices[slice_idx] - return output.view_as(org_output) - - def _apply_lora_triton_nslice( x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], @@ -711,15 +638,6 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # output_temp=output.clone() - # _apply_lora_packed_nslice( - # x, - # self.lora_a_stacked, - # self.lora_b_stacked, - # self.indices[:self.indices_len[0]], - # output, - # (self.output_dim, self.output_dim), - # ) _apply_lora_triton_nslice( x, self.lora_a_stacked, @@ -731,11 +649,6 @@ def apply(self, x: torch.Tensor, output, (self.output_dim, self.output_dim), ) - # flag=torch.allclose(output,output_temp,1e-2,1e-2) - # if flag: - # print("pass") - # else: - # print() return output @classmethod @@ -987,14 +900,6 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # _apply_lora_packed_nslice( - # x, - # self.lora_a_stacked, - # self.lora_b_stacked, - # self.indices[:self.indices_len[0]], - # output, - # self.output_slices, - # ) _apply_lora_triton_nslice( x, self.lora_a_stacked, @@ -1123,31 +1028,6 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: self.indices, self.batch_mlen_stage_lst, output) return output - # def apply(self, x: torch.Tensor) -> torch.Tensor: - # output = self.base_layer.quant_method.apply(self.base_layer, x) - # temp_output = output.clone() - # output2 = output.clone() - # mid_buffer,_=_apply_lora( - # x, - # self.lora_a_stacked, - # self.lora_b_stacked, - # self.indices[:self.indices_len[0]], - # output, - # ) - # batch_size = self.batch_mlen_stage_lst[0] - # mid2_buffer,_=_apply_lora_triton(x, self.lora_a_stacked, - # self.lora_b_stacked, - # self.b_seq_start_tensor[:batch_size], - # self.seq_length_tensor[:batch_size], - # self.indices[:batch_size], - # self.batch_mlen_stage_lst, output) - # flag = torch.allclose(mid_buffer, mid2_buffer, 3e-2, 2e-2) - # # if not flag: - # # print("error") - # # else: - # # print("pass") - # return temp_output - def forward(self, input_): """Forward of RowParallelLinear @@ -1358,25 +1238,14 @@ def _get_logits( self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + lora_logits.shape[1]] = lora_logits - _apply_lora( - hidden_states, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices[:self.indices_len[1]], - logits, - ) - - logits_temp = logits.clone() + batch_mlen_stage_lst=self.batch_mlen_stage_lst.copy() + # LogitsProcessorWithLoRA always using bgmv + batch_mlen_stage_lst[2]=False _apply_lora_triton(hidden_states, self.lora_a_stacked, self.lora_b_stacked, self.b_seq_start_tensor, self.seq_length_tensor, self.indices[:self.indices_len[1]], - 
self.batch_mlen_stage_lst, logits_temp) - flag = torch.allclose(logits_temp, logits, rtol=1e-2, atol=1e-2) - if flag: - print("pass") - else: - print("error") + batch_mlen_stage_lst, logits) # Remove paddings in vocab (if any). logits = logits[:, :self.base_layer.vocab_size] diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index f3ebc29ecfea..90ce268c903b 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -12,148 +12,225 @@ from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice -def _raise_import_error(e): - if torch.cuda.get_device_capability() < (8, 0): - raise ImportError( - "punica LoRA kernels require compute capability >= 8.0") from e - else: - raise ImportError( - "punica LoRA kernels could not be imported. If you built vLLM " - "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " - "was set.") from e - - -def bgmv( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, -): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight - matrices. - indicies: Shape: `[B]`. Indices of the weight matrices. - layer_idx: Layer index of the weight matrices. - scale: Scaling factor. - """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - - punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) - - -def dispatch_bgmv_low_level( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, -): - """ - Same as `bgmv` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. - - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - w_t_all: Shape: `[None, L, y_slice_size, H1]`. Column partition of - all of the transposed LoRA matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - y_offset: Offset to apply to the starting column of y. - y_slice_size: Size of the y column slice. - """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - punica_kernels.dispatch_bgmv_low_level( - y, - x, - w_t_all, - indicies, - layer_idx, - scale, - x.size(1), - y_slice_size, - y_offset, - ) - - -def add_lora( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - *, - buffer: Optional[torch.Tensor] = None, -): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed - LoRA A matrices. - wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed - LoRA B matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. 
- layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - buffer: Optional. Shape: `[B, R]`. Temporary buffer. - """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) - punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, - scale) - +# def _raise_import_error(e): +# if torch.cuda.get_device_capability() < (8, 0): +# raise ImportError( +# "punica LoRA kernels require compute capability >= 8.0") from e +# else: +# raise ImportError( +# "punica LoRA kernels could not be imported. If you built vLLM " +# "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " +# "was set.") from e + + +# def bgmv( +# y: torch.Tensor, +# x: torch.Tensor, +# w_t_all: torch.Tensor, +# indicies: torch.LongTensor, +# layer_idx: int, +# scale: float, +# ): +# """ +# Semantics: +# y[i] += ( +# x[i].unsqueeze(0) +# @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) +# * scale +# ).squeeze(0) + +# Args: +# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. +# x: Shape: `[B, H1]`. Input vectors. +# w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight +# matrices. +# indicies: Shape: `[B]`. Indices of the weight matrices. +# layer_idx: Layer index of the weight matrices. +# scale: Scaling factor. +# """ +# try: +# import vllm._punica_C as punica_kernels +# except ImportError as e: +# _raise_import_error(e) + +# punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) + + +# def dispatch_bgmv_low_level( +# y: torch.Tensor, +# x: torch.Tensor, +# w_t_all: torch.Tensor, +# indicies: torch.LongTensor, +# layer_idx: int, +# scale: float, +# y_offset: int, +# y_slice_size: int, +# ): +# """ +# Same as `bgmv` but you can operate on slices of y. +# Pass whole y, define y_offset and y_slice_size. + +# Semantics: +# y[i] += ( +# x[i].unsqueeze(0) +# @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) +# * scale +# ).squeeze(0) + +# Args: +# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. +# x: Shape: `[B, H1]`. Input vectors. +# w_t_all: Shape: `[None, L, y_slice_size, H1]`. Column partition of +# all of the transposed LoRA matrices. +# indicies: Shape: `[B]`. Indices of the LoRA weights. +# layer_idx: Layer index of LoRA weights. +# scale: Scaling factor. +# y_offset: Offset to apply to the starting column of y. +# y_slice_size: Size of the y column slice. +# """ +# try: +# import vllm._punica_C as punica_kernels +# except ImportError as e: +# _raise_import_error(e) +# punica_kernels.dispatch_bgmv_low_level( +# y, +# x, +# w_t_all, +# indicies, +# layer_idx, +# scale, +# x.size(1), +# y_slice_size, +# y_offset, +# ) + + +# def add_lora( +# y: torch.Tensor, +# x: torch.Tensor, +# wa_t_all: torch.Tensor, +# wb_t_all: torch.Tensor, +# indicies: torch.LongTensor, +# layer_idx: int, +# scale: float, +# *, +# buffer: Optional[torch.Tensor] = None, +# ): +# """ +# Semantics: +# y[i] += ( +# x[i].unsqueeze(0) +# @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) +# @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) +# * scale +# ).squeeze(0) + +# Args: +# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. 
+# x: Shape: `[B, H1]`. Input vectors. +# wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed +# LoRA A matrices. +# wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed +# LoRA B matrices. +# indicies: Shape: `[B]`. Indices of the LoRA weights. +# layer_idx: Layer index of LoRA weights. +# scale: Scaling factor. +# buffer: Optional. Shape: `[B, R]`. Temporary buffer. +# """ +# try: +# import vllm._punica_C as punica_kernels +# except ImportError as e: +# _raise_import_error(e) + +# r = wb_t_all.size(-1) +# if buffer is None: +# # We set the buffer to be float32 by default to avoid +# # numerical inaccuracies that would otherwise happen +# # due to downcasting. +# buffer = torch.zeros((x.size(0), r), +# dtype=torch.float32, +# device=x.device) +# punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) +# punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, +# scale) + + + + + +# def add_lora_slice( +# y: torch.Tensor, +# x: torch.Tensor, +# wa_t_all: torch.Tensor, +# wb_t_all: torch.Tensor, +# indicies: torch.LongTensor, +# layer_idx: int, +# scale: float, +# y_offset: int, +# y_slice_size: int, +# *, +# buffer: Optional[torch.Tensor] = None, +# ): +# """ +# Same as `add_lora` but you can operate on slices of y. +# Pass whole y, define y_offset and y_slice_size. + +# Semantics: +# y[i] += ( +# x[i].unsqueeze(0) +# @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) +# @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) +# * scale +# ).squeeze(0) + +# Args: +# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. +# x: Shape: `[B, H1]`. Input vectors. +# wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed +# LoRA A matrices. +# wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed +# LoRA B matrices. +# indicies: Shape: `[B]`. Indices of the LoRA weights. +# layer_idx: Layer index of LoRA weights. +# scale: Scaling factor. +# y_offset: Offset to apply to the starting column of y. +# y_slice_size: Size of the y column slice. +# """ +# try: +# import vllm._punica_C as punica_kernels +# except ImportError as e: +# _raise_import_error(e) + +# r = wb_t_all.size(-1) +# if buffer is None: +# # We set the buffer to be float32 by default to avoid +# # numerical inaccuracies that would otherwise happen +# # due to downcasting. +# buffer = torch.zeros((x.size(0), r), +# dtype=torch.float32, +# device=x.device) +# punica_kernels.dispatch_bgmv_low_level( +# buffer, +# x, +# wa_t_all, +# indicies, +# layer_idx, +# 1.0, +# x.size(1), +# buffer.size(1), +# 0, +# ) +# punica_kernels.dispatch_bgmv_low_level( +# y, +# buffer, +# wb_t_all, +# indicies, +# layer_idx, +# scale, +# buffer.size(1), +# y_slice_size, +# y_offset, +# ) def add_lora_triton( y: torch.Tensor, @@ -265,82 +342,6 @@ def _lora_bgmv( batch_size, add_inputs=True) - -def add_lora_slice( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, - *, - buffer: Optional[torch.Tensor] = None, -): - """ - Same as `add_lora` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. - - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. 
- wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed - LoRA A matrices. - wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed - LoRA B matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - y_offset: Offset to apply to the starting column of y. - y_slice_size: Size of the y column slice. - """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - punica_kernels.dispatch_bgmv_low_level( - buffer, - x, - wa_t_all, - indicies, - layer_idx, - 1.0, - x.size(1), - buffer.size(1), - 0, - ) - punica_kernels.dispatch_bgmv_low_level( - y, - buffer, - wb_t_all, - indicies, - layer_idx, - scale, - buffer.size(1), - y_slice_size, - y_offset, - ) - - def add_lora_triton_slice( y: torch.Tensor, x: torch.Tensor, From ef8e83a6bdae21b829eeda0e3406886c2751d49f Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 5 Jun 2024 13:25:02 +0800 Subject: [PATCH 16/71] fix bug --- vllm/lora/ops/bgmv_expand_slice.py | 5 +- vllm/lora/ops/sgmv_shrink.py | 6 - vllm/lora/punica.py | 438 ++++++++++++++--------------- 3 files changed, 221 insertions(+), 228 deletions(-) diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 5e30312c7e18..262f7669e0a2 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -93,15 +93,14 @@ def bgmv_expand_slice( """ Args: inputs (torch.Tensor): input tensor - lora_b_weights (torch.Tensor): lora'a weight + lora_b_weights (torch.Tensor): lora'b weight output_tensor (torch.Tensor): output tensor lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch - batchs (int): batch size slice_offst (int): output_tensor's offst slice_size (int): current output_tensor's size + batchs (int): batch size add_inputs (bool, optional): _description_. Defaults to False. - cast_type (bool, optional): _description_. Defaults to False. """ assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 3dd48a8bafac..518cf70bbf12 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -144,12 +144,6 @@ def sgmv_shrink( assert output_tensor.is_contiguous() # TODO tuning this config N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank - # BLOCK_M = config.get("BLOCK_M", 32) - # BLOCK_N = config.get("BLOCK_N", 32) - # BLOCK_K = config.get("BLOCK_K", 32) - # SPLIT_K = config.get("SPLIT_K", 16) - # num_warps = config.get("num_warps", 4) - # num_stages = config.get("num_stages", 3) BLOCK_M = 32 BLOCK_N = 16 BLOCK_K = 32 diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 90ce268c903b..ad48abf9bb9c 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -12,225 +12,225 @@ from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice -# def _raise_import_error(e): -# if torch.cuda.get_device_capability() < (8, 0): -# raise ImportError( -# "punica LoRA kernels require compute capability >= 8.0") from e -# else: -# raise ImportError( -# "punica LoRA kernels could not be imported. 
If you built vLLM " -# "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " -# "was set.") from e - - -# def bgmv( -# y: torch.Tensor, -# x: torch.Tensor, -# w_t_all: torch.Tensor, -# indicies: torch.LongTensor, -# layer_idx: int, -# scale: float, -# ): -# """ -# Semantics: -# y[i] += ( -# x[i].unsqueeze(0) -# @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) -# * scale -# ).squeeze(0) - -# Args: -# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. -# x: Shape: `[B, H1]`. Input vectors. -# w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight -# matrices. -# indicies: Shape: `[B]`. Indices of the weight matrices. -# layer_idx: Layer index of the weight matrices. -# scale: Scaling factor. -# """ -# try: -# import vllm._punica_C as punica_kernels -# except ImportError as e: -# _raise_import_error(e) - -# punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) - - -# def dispatch_bgmv_low_level( -# y: torch.Tensor, -# x: torch.Tensor, -# w_t_all: torch.Tensor, -# indicies: torch.LongTensor, -# layer_idx: int, -# scale: float, -# y_offset: int, -# y_slice_size: int, -# ): -# """ -# Same as `bgmv` but you can operate on slices of y. -# Pass whole y, define y_offset and y_slice_size. - -# Semantics: -# y[i] += ( -# x[i].unsqueeze(0) -# @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) -# * scale -# ).squeeze(0) - -# Args: -# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. -# x: Shape: `[B, H1]`. Input vectors. -# w_t_all: Shape: `[None, L, y_slice_size, H1]`. Column partition of -# all of the transposed LoRA matrices. -# indicies: Shape: `[B]`. Indices of the LoRA weights. -# layer_idx: Layer index of LoRA weights. -# scale: Scaling factor. -# y_offset: Offset to apply to the starting column of y. -# y_slice_size: Size of the y column slice. -# """ -# try: -# import vllm._punica_C as punica_kernels -# except ImportError as e: -# _raise_import_error(e) -# punica_kernels.dispatch_bgmv_low_level( -# y, -# x, -# w_t_all, -# indicies, -# layer_idx, -# scale, -# x.size(1), -# y_slice_size, -# y_offset, -# ) - - -# def add_lora( -# y: torch.Tensor, -# x: torch.Tensor, -# wa_t_all: torch.Tensor, -# wb_t_all: torch.Tensor, -# indicies: torch.LongTensor, -# layer_idx: int, -# scale: float, -# *, -# buffer: Optional[torch.Tensor] = None, -# ): -# """ -# Semantics: -# y[i] += ( -# x[i].unsqueeze(0) -# @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) -# @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) -# * scale -# ).squeeze(0) - -# Args: -# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. -# x: Shape: `[B, H1]`. Input vectors. -# wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed -# LoRA A matrices. -# wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed -# LoRA B matrices. -# indicies: Shape: `[B]`. Indices of the LoRA weights. -# layer_idx: Layer index of LoRA weights. -# scale: Scaling factor. -# buffer: Optional. Shape: `[B, R]`. Temporary buffer. -# """ -# try: -# import vllm._punica_C as punica_kernels -# except ImportError as e: -# _raise_import_error(e) - -# r = wb_t_all.size(-1) -# if buffer is None: -# # We set the buffer to be float32 by default to avoid -# # numerical inaccuracies that would otherwise happen -# # due to downcasting. 
-# buffer = torch.zeros((x.size(0), r), -# dtype=torch.float32, -# device=x.device) -# punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) -# punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, -# scale) - - - - - -# def add_lora_slice( -# y: torch.Tensor, -# x: torch.Tensor, -# wa_t_all: torch.Tensor, -# wb_t_all: torch.Tensor, -# indicies: torch.LongTensor, -# layer_idx: int, -# scale: float, -# y_offset: int, -# y_slice_size: int, -# *, -# buffer: Optional[torch.Tensor] = None, -# ): -# """ -# Same as `add_lora` but you can operate on slices of y. -# Pass whole y, define y_offset and y_slice_size. - -# Semantics: -# y[i] += ( -# x[i].unsqueeze(0) -# @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) -# @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) -# * scale -# ).squeeze(0) - -# Args: -# y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. -# x: Shape: `[B, H1]`. Input vectors. -# wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed -# LoRA A matrices. -# wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed -# LoRA B matrices. -# indicies: Shape: `[B]`. Indices of the LoRA weights. -# layer_idx: Layer index of LoRA weights. -# scale: Scaling factor. -# y_offset: Offset to apply to the starting column of y. -# y_slice_size: Size of the y column slice. -# """ -# try: -# import vllm._punica_C as punica_kernels -# except ImportError as e: -# _raise_import_error(e) - -# r = wb_t_all.size(-1) -# if buffer is None: -# # We set the buffer to be float32 by default to avoid -# # numerical inaccuracies that would otherwise happen -# # due to downcasting. -# buffer = torch.zeros((x.size(0), r), -# dtype=torch.float32, -# device=x.device) -# punica_kernels.dispatch_bgmv_low_level( -# buffer, -# x, -# wa_t_all, -# indicies, -# layer_idx, -# 1.0, -# x.size(1), -# buffer.size(1), -# 0, -# ) -# punica_kernels.dispatch_bgmv_low_level( -# y, -# buffer, -# wb_t_all, -# indicies, -# layer_idx, -# scale, -# buffer.size(1), -# y_slice_size, -# y_offset, -# ) +def _raise_import_error(e): + if torch.cuda.get_device_capability() < (8, 0): + raise ImportError( + "punica LoRA kernels require compute capability >= 8.0") from e + else: + raise ImportError( + "punica LoRA kernels could not be imported. If you built vLLM " + "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " + "was set.") from e + + +def bgmv( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, +): + """ + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight + matrices. + indicies: Shape: `[B]`. Indices of the weight matrices. + layer_idx: Layer index of the weight matrices. + scale: Scaling factor. + """ + try: + import vllm._punica_C as punica_kernels + except ImportError as e: + _raise_import_error(e) + + punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) + + +def dispatch_bgmv_low_level( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, +): + """ + Same as `bgmv` but you can operate on slices of y. + Pass whole y, define y_offset and y_slice_size. 
+ + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + w_t_all: Shape: `[None, L, y_slice_size, H1]`. Column partition of + all of the transposed LoRA matrices. + indicies: Shape: `[B]`. Indices of the LoRA weights. + layer_idx: Layer index of LoRA weights. + scale: Scaling factor. + y_offset: Offset to apply to the starting column of y. + y_slice_size: Size of the y column slice. + """ + try: + import vllm._punica_C as punica_kernels + except ImportError as e: + _raise_import_error(e) + punica_kernels.dispatch_bgmv_low_level( + y, + x, + w_t_all, + indicies, + layer_idx, + scale, + x.size(1), + y_slice_size, + y_offset, + ) + + +def add_lora( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + *, + buffer: Optional[torch.Tensor] = None, +): + """ + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed + LoRA A matrices. + wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed + LoRA B matrices. + indicies: Shape: `[B]`. Indices of the LoRA weights. + layer_idx: Layer index of LoRA weights. + scale: Scaling factor. + buffer: Optional. Shape: `[B, R]`. Temporary buffer. + """ + try: + import vllm._punica_C as punica_kernels + except ImportError as e: + _raise_import_error(e) + + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default to avoid + # numerical inaccuracies that would otherwise happen + # due to downcasting. + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) + punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, + scale) + + + + + +def add_lora_slice( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, + *, + buffer: Optional[torch.Tensor] = None, +): + """ + Same as `add_lora` but you can operate on slices of y. + Pass whole y, define y_offset and y_slice_size. + + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed + LoRA A matrices. + wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed + LoRA B matrices. + indicies: Shape: `[B]`. Indices of the LoRA weights. + layer_idx: Layer index of LoRA weights. + scale: Scaling factor. + y_offset: Offset to apply to the starting column of y. + y_slice_size: Size of the y column slice. 
+ """ + try: + import vllm._punica_C as punica_kernels + except ImportError as e: + _raise_import_error(e) + + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default to avoid + # numerical inaccuracies that would otherwise happen + # due to downcasting. + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + punica_kernels.dispatch_bgmv_low_level( + buffer, + x, + wa_t_all, + indicies, + layer_idx, + 1.0, + x.size(1), + buffer.size(1), + 0, + ) + punica_kernels.dispatch_bgmv_low_level( + y, + buffer, + wb_t_all, + indicies, + layer_idx, + scale, + buffer.size(1), + y_slice_size, + y_offset, + ) def add_lora_triton( y: torch.Tensor, From f75ce8686ca933421d8d28932fba347b0bed3ffe Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 6 Jun 2024 00:28:53 +0800 Subject: [PATCH 17/71] optimize kernel --- vllm/lora/ops/bgmv_expand.py | 61 +++++++++++++++++------------ vllm/lora/ops/bgmv_expand_slice.py | 62 +++++++++++++++++------------- vllm/lora/ops/sgmv_shrink.py | 2 +- 3 files changed, 73 insertions(+), 52 deletions(-) diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 6132b6047997..888fa537a7c4 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -28,6 +28,7 @@ def _bgmv_expand_kernel( BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, SPLIT_N: tl.constexpr, + EVEN_K: tl.constexpr, ADD_INPUTS: tl.constexpr, CAST_TYPE: tl.constexpr, ): @@ -41,41 +42,49 @@ def _bgmv_expand_kernel( return offset_k = tl.arange(0, BLOCK_K) offset_n = tl.arange(0, BLOCK_N) - tiled_a = tl.load( - input_ptr + cur_batch * xm_stride + offset_k * xk_stride, - mask=offset_k < K, - other=0, - ) # [BLOCK_K] + if EVEN_K: + tiled_a = tl.load( + input_ptr + cur_batch * xm_stride + offset_k * xk_stride, + ) # [BLOCK_K] + else: + tiled_a = tl.load( + input_ptr + cur_batch * xm_stride + offset_k * xk_stride, + mask=offset_k < K, + other=0, + ) # [BLOCK_K] split_n_length = tl.cdiv(N, SPLIT_N) if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # sliding to next row-block - b_ptr = (lora_ptr + l0_stride * lora_index + - pid_sn * split_n_length * lora_k_stride) + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + pid_sn * split_n_length * lora_k_stride + ) + c_ptr = out_ptr + cur_batch * cm_stride + pid_sn * split_n_length for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n # vector load current_n_c = tl.max_contiguous(current_n, BLOCK_N) - b_ptr_mask = (current_n[:, None] < split_n_length) & (offset_k[None, :] - < K) - + b_ptr_mask = (current_n[:, None] < split_n_length) & ( + offset_k[None, :] < K + ) + c_mask = current_n < split_n_length tiled_b = tl.load( - b_ptr + current_n_c[:, None] * lora_k_stride + - offset_k[None, :] * lora_n_stride, + b_ptr + + current_n_c[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, mask=b_ptr_mask, other=0.0, ) # [BLOCK_N,BLOCK_K] - - accumulator = tl.sum(tiled_a * tiled_b, 1) - - c_ptr = (out_ptr + cur_batch * cm_stride + pid_sn * split_n_length + - current_n * cn_stride) - c_mask = current_n < split_n_length if ADD_INPUTS: - tiled_out = tl.load(c_ptr, mask=c_mask) - accumulator += tiled_out - tl.store(c_ptr, accumulator, mask=c_mask) + tiled_out = tl.load(c_ptr + current_n * cn_stride, mask=c_mask) + accumulator = tl.sum(tiled_a * tiled_b, 1) + tiled_out + else: + accumulator = tl.sum(tiled_a * tiled_b, 1) + + tl.store(c_ptr + current_n * cn_stride, accumulator, mask=c_mask) @torch.inference_mode() @@ -122,14 +131,15 @@ def bgmv_expand( # TODO 
tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - BLOCK_N = 512 + BLOCK_N = 128 BLOCK_K = triton.next_power_of_2(K) - SPLIT_N = 8 + SPLIT_N = 128 + EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ @@ -153,6 +163,7 @@ def bgmv_expand( BLOCK_N, BLOCK_K, SPLIT_N, + EVEN_K, ADD_INPUTS, CAST_TYPE, ) diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 262f7669e0a2..a8fb5719ab95 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -29,6 +29,7 @@ def _bgmv_expand_slice_kernel( BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, SPLIT_N: tl.constexpr, + EVEN_K: tl.constexpr, ADD_INPUTS: tl.constexpr, CAST_TYPE: tl.constexpr, ): @@ -42,41 +43,48 @@ def _bgmv_expand_slice_kernel( return offset_k = tl.arange(0, BLOCK_K) offset_n = tl.arange(0, BLOCK_N) - tiled_a = tl.load( - input_ptr + cur_batch * xm_stride + offset_k * xk_stride, - mask=offset_k < K, - other=0, - ) # [BLOCK_K] + if EVEN_K: + tiled_a = tl.load( + input_ptr + cur_batch * xm_stride + offset_k * xk_stride, + ) # [BLOCK_K] + else: + tiled_a = tl.load( + input_ptr + cur_batch * xm_stride + offset_k * xk_stride, + mask=offset_k < K, + other=0, + ) # [BLOCK_K] split_n_length = tl.cdiv(N, SPLIT_N) if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # sliding to next row-block - b_ptr = (lora_ptr + l0_stride * lora_index + - pid_sn * split_n_length * lora_k_stride) + b_ptr = ( + lora_ptr + + l0_stride * lora_index + + pid_sn * split_n_length * lora_k_stride + ) + c_ptr = out_ptr + cur_batch * cm_stride + pid_sn * split_n_length for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n - # vector load - current_n_c = tl.max_contiguous(current_n, BLOCK_N) - b_ptr_mask = (current_n[:, None] < split_n_length) & (offset_k[None, :] - < K) - + b_ptr_mask = (current_n[:, None] < split_n_length) & ( + offset_k[None, :] < K + ) + c_mask = current_n < split_n_length tiled_b = tl.load( - b_ptr + current_n_c[:, None] * lora_k_stride + - offset_k[None, :] * lora_n_stride, + b_ptr + + current_n[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, mask=b_ptr_mask, other=0.0, ) # [BLOCK_N,BLOCK_K] - accumulator = tl.sum(tiled_a * tiled_b, 1) - - c_ptr = (out_ptr + cur_batch * cm_stride + pid_sn * split_n_length + - slice_offset * cn_stride + current_n * cn_stride) - c_mask = current_n < split_n_length if ADD_INPUTS: - tiled_out = tl.load(c_ptr, mask=c_mask) - accumulator += tiled_out - tl.store(c_ptr, accumulator, mask=c_mask) + tiled_out = tl.load(c_ptr + current_n * cn_stride, mask=c_mask) + accumulator = tl.sum(tiled_a * tiled_b, 1) + tiled_out + else: + accumulator = tl.sum(tiled_a * tiled_b, 1) + + tl.store(c_ptr + current_n * cn_stride, accumulator, mask=c_mask) @torch.inference_mode() @@ -126,14 +134,15 @@ def bgmv_expand_slice( # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - BLOCK_N = 512 + BLOCK_N = 256 BLOCK_K = triton.next_power_of_2(K) - SPLIT_N = 8 + SPLIT_N = 128 + EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ @@ -158,6 +167,7 @@ def bgmv_expand_slice( BLOCK_N, BLOCK_K, SPLIT_N, + EVEN_K, ADD_INPUTS, CAST_TYPE, ) diff 
--git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 518cf70bbf12..9fc7508c9421 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -148,7 +148,7 @@ def sgmv_shrink( BLOCK_N = 16 BLOCK_K = 32 SPLIT_K = 8 - EVEN_K = K % BLOCK_K == 0 + EVEN_K = K % (BLOCK_K*SPLIT_K) == 0 grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), SPLIT_K, From c0bc06a4e3554207eba7577f9856b1989b9c69cc Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 11 Jun 2024 21:18:18 +0800 Subject: [PATCH 18/71] trigger test --- tests/lora/test_triton_punica.py | 431 +++++++++++++++++++---------- vllm/lora/layers.py | 95 ++++--- vllm/lora/models.py | 2 +- vllm/lora/ops/bgmv_expand.py | 36 +-- vllm/lora/ops/bgmv_expand_slice.py | 38 ++- vllm/lora/ops/bgmv_shrink.py | 19 +- vllm/lora/ops/sgmv_expand.py | 5 +- vllm/lora/ops/sgmv_expand_slice.py | 3 +- vllm/lora/ops/sgmv_shrink.py | 13 +- vllm/lora/punica.py | 73 ++--- 10 files changed, 442 insertions(+), 273 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 6aea2573d962..8f28821a9336 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -1,14 +1,14 @@ -import random - import pytest import torch +import vllm._punica_C as punica_kernels import vllm.lora.punica as punica -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_shrink import sgmv_shrink from vllm.lora.ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink +from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice +from vllm.lora.ops.sgmv_shrink import sgmv_shrink # The current punica kernel supports dimension and adds a dimension of 3424. 
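# Illustrative sketch (not from this patch) of why the EVEN_K change above
# tests K % (BLOCK_K * SPLIT_K) rather than K % BLOCK_K: each split-K
# program starts at pid_sk * BLOCK_K and advances by BLOCK_K * SPLIT_K, so
# across the grid the unmasked loads touch
# ceil(K / (BLOCK_K * SPLIT_K)) * BLOCK_K * SPLIT_K positions along K.
def overread_without_mask(K: int, BLOCK_K: int, SPLIT_K: int) -> int:
    step = BLOCK_K * SPLIT_K
    num_iters = -(-K // step)        # ceil division, same as tl.cdiv
    return num_iters * step - K      # elements read past K if loads are unmasked

assert overread_without_mask(4096, 32, 8) == 0    # safe to skip the mask
assert overread_without_mask(4128, 32, 8) == 224  # multiple of BLOCK_K only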
HIDDEN_SIZES = [ @@ -62,20 +62,26 @@ 128000, 128256, ] -BATCHS = [i for i in range(0, 128, 8)] + +_BATCH_SIZE_ALIGNMENT = 8 + +# vllm support batch size +BATCHS = [1, 2, 4] + [_BATCH_SIZE_ALIGNMENT * i for i in range(1, 8)] + NUM_LORA = [1, 4, 8, 16, 32, 64, 128, 256] -DTYPES = [torch.half, torch.bfloat16, torch.float32] +DTYPES = [torch.float16,torch.bfloat16] MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] SCALES = [0.5] OP_TYPES = ["shrink", "expand"] SEED = [0] CUDA_DEVICES = [f"cuda:{0}"] +NSLICES = [2, 3] def assert_close(a, b): rtol, atol = { - torch.float16: (1e-2, 1e-2), - torch.bfloat16: (12e-2, 1e-2), + torch.float16: (6e-2, 6e-2), + torch.bfloat16: (6e-2, 6e-2), torch.float32: (1e-2, 1e-2), }[a.dtype] torch.testing.assert_close(a, b, rtol=rtol, atol=atol) @@ -101,7 +107,7 @@ def _torch_groupgemm( out_list = [] current_offset = 0 for lora_index, b_length in zip(range(batchs), seq_len_tensor): - input_weight = inputs[current_offset:b_length + current_offset, :] + input_weight = inputs[current_offset : b_length + current_offset, :] current_offset += b_length lora_weight = lora_weights[lora_indices_tensor[lora_index]] result = torch.nn.functional.linear(input_weight, lora_weight) @@ -115,27 +121,29 @@ def _torch_groupgemm( return -def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, - op_type, device): +def _generate_data( + batchs, hidden_size, lora_nums, max_rank, max_length, dtype, op_type, device +): if max_length == 1: max_length += 1 - seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) + seq_len_tensor = torch.randint(1, max_length, (batchs,)).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, ).to(device) total_tokens = seq_len_tensor.sum() if op_type == "shrink": - inputs_tensor = torch.rand((total_tokens, hidden_size), - dtype=dtype).to(device) + inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to( + device + ) lora_weights = torch.rand( (lora_nums, max_rank, hidden_size), # col-major dtype=dtype, ).to(device) # shrink op need atomic_add, so output is initinized by 0 - ref_out_tensor = torch.zeros((total_tokens, max_rank), - dtype=dtype, - device=inputs_tensor.device) + ref_out_tensor = torch.zeros( + (total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device + ) # NOTE shrink kernel using torch.float32 as output type our_out_tensor = torch.zeros( (total_tokens, max_rank), @@ -161,15 +169,16 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, # Ensure the same input. 
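# Illustrative sketch (not from this patch) of why the shrink output above is
# zero-initialized and kept in torch.float32: with SPLIT_K > 1 each split-K
# program adds its partial dot product into the same output element (the
# kernel uses tl.atomic_add), so the buffer must start at zero, and a float32
# accumulator keeps the partial sums accurate. The expand output is instead
# seeded with existing values because expand adds onto them (add_inputs).
import torch

K, SPLIT_K = 1024, 8
x, w = torch.randn(K), torch.randn(K)
out = torch.zeros((), dtype=torch.float32)       # must be zeroed beforehand
for pid_sk in range(SPLIT_K):                    # one partial sum per program
    out += x[pid_sk::SPLIT_K] @ w[pid_sk::SPLIT_K]
torch.testing.assert_close(out, x @ w, rtol=1e-4, atol=1e-3)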
our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint(0, - lora_nums - 1 if lora_nums > 1 else 1, - (batchs, )).to(device) + lora_indices_tensor = torch.randint( + 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs,) + ).to(device) indices = torch.zeros((total_tokens), dtype=torch.long).to(device) current_offset = 0 for b_id in range(batchs): lora_index = lora_indices_tensor[b_id] - indices[current_offset:current_offset + - seq_len_tensor[b_id]] = lora_index.item() + indices[ + current_offset : current_offset + seq_len_tensor[b_id] + ] = lora_index.item() current_offset += seq_len_tensor[b_id].item() return ( inputs_tensor, @@ -183,89 +192,145 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, ) -# @pytest.mark.skip("work in progress") -@pytest.mark.parametrize("batchs", BATCHS) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", OP_TYPES) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sgmv_torch( - batchs: int, - num_loras: int, - rank: int, - scaling: float, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, +def _generate_data_expand_nslices( + batchs, hidden_size, lora_nums, max_rank, max_length, dtype, nslices, device ): - torch.manual_seed(seed) - if batchs == 0: - batchs += 1 - hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) - hidden_size = HIDDEN_SIZES[hidden_size_index] - if hidden_size > 100000: - hidden_size = hidden_size // 4 # avoid OOM - ( + if max_length == 1: + max_length += 1 + seq_len_tensor = torch.randint(1, max_length, (batchs,)).to(device) + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0, + ).to(device) + total_tokens = seq_len_tensor.sum() + + inputs_tensor = torch.rand( + (total_tokens, max_rank), + dtype=dtype, + ).to(device) + lora_weights_lst = [] + for _ in range(nslices): + lora_weights_lst.append( + torch.rand( + (lora_nums, hidden_size, max_rank), # col-major + dtype=dtype, + ).to(device) + ) + # expand op needs to complete y+=a@lora_b, so output is + # initinized randomly + ref_out_tensor = torch.rand( + (total_tokens, hidden_size * nslices), + dtype=dtype, + device=inputs_tensor.device, + ) + # Ensure the same input. + our_out_tensor = ref_out_tensor.clone() + + lora_indices_tensor = torch.randint( + 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs,) + ).to(device) + indices = torch.zeros((total_tokens), dtype=torch.long).to(device) + current_offset = 0 + for b_id in range(batchs): + lora_index = lora_indices_tensor[b_id] + indices[ + current_offset : current_offset + seq_len_tensor[b_id] + ] = lora_index.item() + current_offset += seq_len_tensor[b_id].item() + return ( inputs_tensor, - lora_weights, + lora_weights_lst, our_out_tensor, ref_out_tensor, b_seq_start_loc, lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data( - batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, - device) # The sequence length is restricted to the range [1, 1024]. 
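# Illustrative sketch (not from this patch): the per-sequence metadata built
# by _generate_data above, shown with an assumed toy batch. b_seq_start_loc
# is an exclusive cumsum of the sequence lengths, and the token-level
# `indices` repeat each sequence's LoRA id once per token.
import torch

seq_lens = torch.tensor([4, 6, 2])     # assumed lengths of 3 sequences
lora_ids = torch.tensor([1, 0, 3])     # one LoRA id per sequence
b_seq_start_loc = torch.cat(
    [torch.zeros(1, dtype=torch.long), seq_lens.cumsum(0)[:-1]])
token_lora_ids = torch.repeat_interleave(lora_ids, seq_lens)
print(b_seq_start_loc.tolist())  # [0, 4, 10]
print(token_lora_ids.tolist())   # [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 3, 3]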
- max_seq_length = seq_len_tensor.max() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - if op_type == "shrink": - sgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - scaling, - ) - else: - sgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - add_inputs=True, - ) - _torch_groupgemm( - ref_out_tensor, - inputs_tensor, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batchs, - scaling if op_type == "shrink" else 1.0, - op_type, ) - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) - assert_close(our_out_tensor, ref_out_tensor) -# @pytest.mark.skip("work in progress") +# @pytest.mark.parametrize("batchs", BATCHS) +# @pytest.mark.parametrize("num_loras", NUM_LORA) +# @pytest.mark.parametrize("rank", MAX_RANKS) +# @pytest.mark.parametrize("scaling", SCALES) +# @pytest.mark.parametrize("dtype", DTYPES) +# @pytest.mark.parametrize("op_type", OP_TYPES) +# @pytest.mark.parametrize("seed", SEED) +# @pytest.mark.parametrize("device", CUDA_DEVICES) +# def test_sgmv_torch( +# batchs: int, +# num_loras: int, +# rank: int, +# scaling: float, +# dtype: torch.dtype, +# op_type: str, +# seed: int, +# device: str, +# ): +# torch.manual_seed(seed) +# torch.set_default_device(device) +# if batchs == 0: +# batchs += 1 +# hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) +# hidden_size = HIDDEN_SIZES[hidden_size_index] +# if hidden_size > 100000: +# hidden_size = hidden_size // 4 # avoid OOM +# ( +# inputs_tensor, +# lora_weights, +# our_out_tensor, +# ref_out_tensor, +# b_seq_start_loc, +# lora_indices_tensor, +# seq_len_tensor, +# indices, +# ) = _generate_data( +# batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, device +# ) # The sequence length is restricted to the range [1, 1024]. 
+# max_seq_length = seq_len_tensor.max() +# if isinstance(max_seq_length, tuple): +# max_seq_length = max_seq_length[0].item() +# else: +# max_seq_length = max_seq_length.item() +# if op_type == "shrink": +# sgmv_shrink( +# inputs_tensor, +# lora_weights, +# our_out_tensor, +# b_seq_start_loc, +# seq_len_tensor, +# lora_indices_tensor, +# batchs, +# max_seq_length, +# scaling, +# ) +# else: +# sgmv_expand( +# inputs_tensor, +# lora_weights, +# our_out_tensor, +# b_seq_start_loc, +# seq_len_tensor, +# lora_indices_tensor, +# batchs, +# max_seq_length, +# add_inputs=True, +# ) +# _torch_groupgemm( +# ref_out_tensor, +# inputs_tensor, +# lora_weights, +# lora_indices_tensor, +# seq_len_tensor, +# batchs, +# scaling if op_type == "shrink" else 1.0, +# op_type, +# ) +# if op_type == "shrink": +# ref_out_tensor = ref_out_tensor.to(torch.float32) +# assert_close(our_out_tensor, ref_out_tensor) + + @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) @@ -284,9 +349,10 @@ def test_triton_sgmv_punica_bgmv( if dtype == torch.float32 or hidden_size == 3424: return torch.manual_seed(seed) + torch.set_default_device(device) batchs = 4 # Arbitrary values for testing - rank = 16 - seq_len = 333 # Arbitrary values for testing + rank = 16 # Arbitrary values for testing + seq_len = 128 # Arbitrary values for testing num_loras = 8 # Arbitrary values for testing ( inputs_tensor, @@ -297,8 +363,9 @@ def test_triton_sgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, - op_type, device) + ) = _generate_data( + batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device + ) max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): @@ -362,6 +429,7 @@ def test_triton_bgmv_punica_bgmv( if dtype == torch.float32 or hidden_size == 3424: return torch.manual_seed(seed) + torch.set_default_device(device) if batchs == 0: batchs += 1 rank = 16 @@ -376,8 +444,9 @@ def test_triton_bgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, - op_type, device) + ) = _generate_data( + batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device + ) if op_type == "shrink": bgmv_shrink( @@ -409,18 +478,17 @@ def test_triton_bgmv_punica_bgmv( ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) - -@pytest.mark.skip("work in progress") +@pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("nslices", NSLICES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sgmv_expand_nslice( - hidden_size, - scaling: float, - dtype: torch.dtype, - op_type: str, +def test_sgmv_expand_slice( + batchs:int, + hidden_size: int, + nslices: int, + dtype: str, seed: int, device: str, ): @@ -428,59 +496,140 @@ def test_sgmv_expand_nslice( if dtype == torch.float32 or hidden_size == 3424: return torch.manual_seed(seed) - batchs = 4 # Arbitrary values for testing - rank = 16 - seq_len = 333 # Arbitrary values for testing - num_loras = 8 # Arbitrary values for testing + torch.set_default_device(device) + max_rank = 16 + lora_nums = 4 + max_length = 128 ( inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, + 
lora_weights_lst, + our_outputs, + ref_outputs, b_seq_start_loc, lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, - op_type, device) - + ) = _generate_data_expand_nslices( + batchs, + hidden_size, + lora_nums, + max_rank, + max_length, + dtype, + nslices, + device, + ) max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): max_seq_length = max_seq_length[0].item() else: max_seq_length = max_seq_length.item() + slice_offset = 0 + for index in range(nslices): + lora_weights = lora_weights_lst[index] + sgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + slice_offset, + hidden_size, + add_inputs=True, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + punica_kernels.dispatch_bgmv_low_level( + ref_outputs, + inputs_tensor, + lora_weights_4d, + indices, + 0, + 1.0, + inputs_tensor.size(1), + hidden_size, + slice_offset, + ) + slice_offset += hidden_size + assert_close(our_outputs, ref_outputs) - sgmv_expand_slice( +@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("nslices", NSLICES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_bgmv_expand_slice( + batchs:int, + hidden_size: int, + nslices: int, + dtype: str, + seed: int, + device: str, +): + # avoid `No suitable kernel. h_in=xx h_out=xxxx ` error + if dtype == torch.float32 or hidden_size == 3424: + return + torch.manual_seed(seed) + torch.set_default_device(device) + max_rank = 64 + lora_nums = 8 + ( inputs_tensor, - lora_weights, - our_out_tensor, + lora_weights_lst, + our_outputs, + ref_outputs, b_seq_start_loc, - seq_len_tensor, lora_indices_tensor, - batchs, - max_seq_length, - 1024, - add_inputs=True, - ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - _punica_bgmv( - ref_out_tensor, - inputs_tensor, - lora_weights_4d, + seq_len_tensor, indices, - scaling if op_type == "shrink" else 1.0, + ) = _generate_data_expand_nslices( + batchs, + hidden_size, + lora_nums, + max_rank, + 1, + dtype, + nslices, + device, ) - - assert_close(our_out_tensor, ref_out_tensor) + slice_offset = 0 + for index in range(nslices): + lora_weights = lora_weights_lst[index] + bgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + lora_indices_tensor, + slice_offset, + hidden_size, + batchs, + add_inputs=True, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + punica_kernels.dispatch_bgmv_low_level( + ref_outputs, + inputs_tensor, + lora_weights_4d, + lora_indices_tensor, + 0, + 1.0, + inputs_tensor.size(1), + hidden_size, + slice_offset, + ) + slice_offset += hidden_size + assert_close(our_outputs, ref_outputs) if __name__ == "__main__": - test_triton_bgmv_punica_bgmv( - batchs=1, - hidden_size=128, - scaling=0.5, - dtype=torch.float16, - op_type="expand", + test_bgmv_expand_slice( + batchs=256, + hidden_size=3424, + nslices=2, + dtype=torch.bfloat16, seed=0, device="cuda:0", ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index ff922a14d879..96b37ab8880c 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -16,9 +16,8 @@ tensor_model_parallel_all_reduce, tensor_model_parallel_gather) from vllm.distributed.utils import divide -from vllm.lora.punica import (add_lora_triton, - add_lora_triton_slice) from vllm.lora.ops.sgmv_expand import sgmv_expand +from 
vllm.lora.punica import add_lora_triton, add_lora_triton_slice from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, @@ -63,6 +62,7 @@ def dec(*args, **kwargs): return dec + def _apply_lora_triton( x: torch.Tensor, lora_a_stacked: torch.Tensor, @@ -72,30 +72,40 @@ def _apply_lora_triton( lora_index_tensor: torch.Tensor, batch_mlen_stage_lst: List[int], output: torch.Tensor, -): - # """Applies lora to each input. - - # This method applies all loras to each input. It uses the - # indices vector to determine which lora yields the - # correct output. An index of -1 means no lora should be - # applied. This method adds the final lora results to the - # output. - - # Input shapes: - # x: (batch_size, hidden_dim) - # lora_a_stacked: (num_loras, lora_rank, hidden_dim) - # lora_b_stacked: (num_loras, output_dim, lora_rank) - # indices: (batch_size) - # output: (batch_size, output_dim) - # """ +) -> torch.Tensor: + """Applies lora to each input. This method applies all loras to each + input. It uses the `lora_index_tensor` vector to determine which lora + yields the correct output. An index of -1 means no lora should be + applied. This method adds the final lora results to the output. + + Args: + x (torch.Tensor): (batch_size, hidden_dim) + lora_a_stacked (torch.Tensor): (num_loras, lora_rank, hidden_dim) + lora_b_stacked (torch.Tensor): (num_loras, output_dim, lora_rank) + b_seq_start_tensor (torch.Tensor): (batch_size,). The cumulative + sequence lengths of the sequences in the batch, used to index + into sequence. E.g.,if the sequence length is [4, 6], it is + [0, 4]. + seq_length_tensor (torch.Tensor): batch_size,). record the sequence + length of the sequences in the batch + lora_index_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batch_mlen_stage_lst (List[int]): (3,).Sequentially represent batch + size, maximum seq length, and prefilling stage flag. + output (torch.Tensor): (batch_size, output_dim) + + Returns: + output (torch.Tensor): (batch_size, output_dim) + + """ org_output = output x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) - + # batch_size = batch_mlen_stage_lst[0] max_length = batch_mlen_stage_lst[1] is_prefilling = bool(batch_mlen_stage_lst[2]) - # maybe we need not restrict range to [:batch_size] + add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, b_seq_start_tensor[:batch_size], seq_length_tensor[:batch_size], @@ -114,22 +124,31 @@ def _apply_lora_triton_nslice( batch_mlen_stage_lst: List[int], output: torch.Tensor, output_slices: Tuple[int, ...], -): - # """Applies lora to each input. - - # This method applies all loras to each input. It uses the - # indices vector to determine which lora yields the - # correct output. An index of -1 means no lora should be - # applied. This method adds the final lora results to the - # output. - - # Input shapes: - # x: (batch_size, hidden_dim) - # lora_a_stacked: (num_loras, lora_rank, hidden_dim) - # lora_b_stacked: (num_loras, output_dim, lora_rank) - # indices: (batch_size) - # output: (batch_size, output_dim) - # """ +) -> torch.Tensor: + """Applies lora to each input. This method applies all loras to each + input. It uses the `lora_index_tensor` vector to determine which lora + yields the correct output. An index of -1 means no lora should be + applied. This method adds the final lora results to the output. 
+ + Args: + x (torch.Tensor): (batch_size, hidden_dim) + lora_a_stacked (torch.Tensor): (num_loras, lora_rank, hidden_dim) + lora_b_stacked (torch.Tensor): (num_loras, output_dim, lora_rank) + b_seq_start_tensor (torch.Tensor): (batch_size,). The cumulative + sequence lengths of the sequences in the batch, used to index + into sequence. E.g.,if the sequence length is [4, 6], it is + [0, 4]. + seq_length_tensor (torch.Tensor): batch_size,). record the sequence + length of the sequences in the batch + lora_index_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batch_mlen_stage_lst (List[int]): (3,).Sequentially represent batch + size, maximum seq length, and prefilling stage flag. + output_slices (Tuple[int, ...]): Size of each output column + + Returns: + output (torch.Tensor): (batch_size, output_dim) + """ org_output = output x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) @@ -1238,9 +1257,9 @@ def _get_logits( self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + lora_logits.shape[1]] = lora_logits - batch_mlen_stage_lst=self.batch_mlen_stage_lst.copy() + batch_mlen_stage_lst = self.batch_mlen_stage_lst.copy() # LogitsProcessorWithLoRA always using bgmv - batch_mlen_stage_lst[2]=False + batch_mlen_stage_lst[2] = False _apply_lora_triton(hidden_states, self.lora_a_stacked, self.lora_b_stacked, self.b_seq_start_tensor, self.seq_length_tensor, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 438eeff1ff0c..b6c47e599e81 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -451,7 +451,7 @@ def __init__( # element contains batch_size, max_length, 0 or 1. Use 1 for the # prefilling stage and 0 for the decoding stage.The reason for # distinguishing between the prefilling and decoding stage is that - # if we have implemented bgmv, it can be utilized during the decoding + # we had implemented bgmv, it can be utilized during the decoding # stage. 
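# Illustrative sketch (not from this patch): how the three slots of
# batch_mlen_stage_lst described above are consumed. The code that fills the
# list each step is not shown here, so this helper is an assumption.
def make_batch_mlen_stage(seq_lens, is_prefilling: bool):
    # [batch size, max seq length in the batch, 1 for prefill / 0 for decode]
    return [len(seq_lens), max(seq_lens), int(is_prefilling)]

meta = make_batch_mlen_stage([4, 6, 2], is_prefilling=True)   # -> [3, 6, 1]
batch_size, max_length, stage = meta
use_sgmv = bool(stage)   # prefill uses the SGMV kernels, decode uses BGMV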
self.batch_mlen_stage_lst = [-1] * 3 self._create_lora_modules() diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 888fa537a7c4..8ec26bdb6b83 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -5,9 +5,9 @@ https://arxiv.org/abs/2310.18547 """ +import torch import triton import triton.language as tl -import torch @triton.jit @@ -33,7 +33,8 @@ def _bgmv_expand_kernel( CAST_TYPE: tl.constexpr, ): """ - C=A@B, and B is col-major matrix + GroupGEMV,Additionally, introducing SPLIT_N can improve large hidden_size's + performance """ pid_sn = tl.program_id(axis=0) cur_batch = tl.program_id(axis=1) @@ -43,9 +44,8 @@ def _bgmv_expand_kernel( offset_k = tl.arange(0, BLOCK_K) offset_n = tl.arange(0, BLOCK_N) if EVEN_K: - tiled_a = tl.load( - input_ptr + cur_batch * xm_stride + offset_k * xk_stride, - ) # [BLOCK_K] + tiled_a = tl.load(input_ptr + cur_batch * xm_stride + + offset_k * xk_stride, ) # [BLOCK_K] else: tiled_a = tl.load( input_ptr + cur_batch * xm_stride + offset_k * xk_stride, @@ -57,24 +57,19 @@ def _bgmv_expand_kernel( if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # sliding to next row-block - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + pid_sn * split_n_length * lora_k_stride - ) + b_ptr = (lora_ptr + l0_stride * lora_index + + pid_sn * split_n_length * lora_k_stride) c_ptr = out_ptr + cur_batch * cm_stride + pid_sn * split_n_length for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n # vector load current_n_c = tl.max_contiguous(current_n, BLOCK_N) - b_ptr_mask = (current_n[:, None] < split_n_length) & ( - offset_k[None, :] < K - ) + b_ptr_mask = (current_n[:, None] < split_n_length) & (offset_k[None, :] + < K) c_mask = current_n < split_n_length tiled_b = tl.load( - b_ptr - + current_n_c[:, None] * lora_k_stride - + offset_k[None, :] * lora_n_stride, + b_ptr + current_n_c[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, mask=b_ptr_mask, other=0.0, ) # [BLOCK_N,BLOCK_K] @@ -112,7 +107,6 @@ def bgmv_expand( assert lora_b_weights.dtype in [ torch.float16, torch.bfloat16, - torch.float32, ] assert inputs.size(1) == lora_b_weights.size(-1) @@ -131,15 +125,15 @@ def bgmv_expand( # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - BLOCK_N = 128 + BLOCK_N = 256 BLOCK_K = triton.next_power_of_2(K) - SPLIT_N = 128 + SPLIT_N = 64 EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index a8fb5719ab95..3d41d064ea7c 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -5,9 +5,9 @@ https://arxiv.org/abs/2310.18547 """ +import torch import triton import triton.language as tl -import torch @triton.jit @@ -34,7 +34,8 @@ def _bgmv_expand_slice_kernel( CAST_TYPE: tl.constexpr, ): """ - C=A@B, and B is col-major matrix + GroupGEMV,Additionally, introducing SPLIT_N can improve large hidden_size's + performance """ pid_sn = tl.program_id(axis=0) cur_batch = tl.program_id(axis=1) @@ -44,9 +45,8 @@ def _bgmv_expand_slice_kernel( offset_k = tl.arange(0, BLOCK_K) offset_n = tl.arange(0, BLOCK_N) if EVEN_K: - tiled_a = tl.load( - input_ptr + cur_batch * xm_stride + offset_k * xk_stride, - ) # [BLOCK_K] + tiled_a = tl.load(input_ptr + cur_batch * xm_stride + + offset_k * 
xk_stride, ) # [BLOCK_K] else: tiled_a = tl.load( input_ptr + cur_batch * xm_stride + offset_k * xk_stride, @@ -58,22 +58,19 @@ def _bgmv_expand_slice_kernel( if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) # sliding to next row-block - b_ptr = ( - lora_ptr - + l0_stride * lora_index - + pid_sn * split_n_length * lora_k_stride - ) - c_ptr = out_ptr + cur_batch * cm_stride + pid_sn * split_n_length + b_ptr = (lora_ptr + l0_stride * lora_index + + pid_sn * split_n_length * lora_k_stride) + c_ptr = (out_ptr + cur_batch * cm_stride + pid_sn * split_n_length + + slice_offset * cn_stride) + for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n - b_ptr_mask = (current_n[:, None] < split_n_length) & ( - offset_k[None, :] < K - ) + b_ptr_mask = (current_n[:, None] < split_n_length) & (offset_k[None, :] + < K) c_mask = current_n < split_n_length tiled_b = tl.load( - b_ptr - + current_n[:, None] * lora_k_stride - + offset_k[None, :] * lora_n_stride, + b_ptr + current_n[:, None] * lora_k_stride + + offset_k[None, :] * lora_n_stride, mask=b_ptr_mask, other=0.0, ) # [BLOCK_N,BLOCK_K] @@ -115,7 +112,6 @@ def bgmv_expand_slice( assert lora_b_weights.dtype in [ torch.float16, torch.bfloat16, - torch.float32, ] assert inputs.size(1) == lora_b_weights.size(-1) assert lora_indices_tensor.size(0) == batchs @@ -136,13 +132,13 @@ def bgmv_expand_slice( N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size BLOCK_N = 256 BLOCK_K = triton.next_power_of_2(K) - SPLIT_N = 128 + SPLIT_N = 64 EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False if inputs.dtype == torch.float32 and lora_b_weights.dtype in [ - torch.float16, - torch.bfloat16, + torch.float16, + torch.bfloat16, ]: CAST_TYPE = True grid = [ diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index 5495e6f54353..d2166a5593ab 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -5,9 +5,9 @@ https://arxiv.org/abs/2310.18547 """ +import torch import triton import triton.language as tl -import torch @triton.jit @@ -30,6 +30,10 @@ def _bgmv_shrink_kernel( BLOCK_K: tl.constexpr, SPLIT_K: tl.constexpr, ): + """ + GroupGEMV,Additionally, introducing SPLIT-K can improve large hidden_size's + performance + """ pid_sk = tl.program_id(axis=0) cur_batch = tl.program_id(axis=1) lora_index = tl.load(lora_indices + cur_batch) @@ -81,7 +85,6 @@ def bgmv_shrink( scaling: float, ): """ - Args: inputs (torch.Tensor): input tensor lora_a_weights (torch.Tensor): lora'a weight @@ -92,7 +95,11 @@ def bgmv_shrink( scaling (float): Scaling factor. 
""" assert inputs.dtype == lora_a_weights.dtype - assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert inputs.dtype in [torch.float16, torch.bfloat16] + assert lora_a_weights.dtype in [ + torch.float16, + torch.bfloat16, + ] assert inputs.size(1) == lora_a_weights.size(-1) assert lora_indices_tensor.size(0) == batchs assert inputs.is_contiguous() @@ -106,14 +113,13 @@ def bgmv_shrink( assert output_tensor.is_contiguous() # TODO tuning this config N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank - BLOCK_K = 512 + BLOCK_K = 256 BLOCK_N = triton.next_power_of_2(output_tensor.size(1)) - SPLIT_K = 16 + SPLIT_K = 64 grid = [ SPLIT_K, batchs, ] - config = {"num_stages": 4, "num_warps": 8} _bgmv_shrink_kernel[grid]( inputs, lora_a_weights, @@ -132,6 +138,5 @@ def bgmv_shrink( BLOCK_N, BLOCK_K, SPLIT_K, - **config, ) return diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index f2af7be4ad62..f34eec0357bd 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -106,9 +106,7 @@ def sgmv_expand( max_seq_length: int, add_inputs: bool = False, ): - """_summary_ - - + """ Args: inputs (torch.Tensor): input tensor lora_b_weights (torch.Tensor): lora'a weight @@ -132,7 +130,6 @@ def sgmv_expand( assert lora_b_weights.dtype in [ torch.float16, torch.bfloat16, - torch.float32, ] assert inputs.size(1) == lora_b_weights.size(-1) assert b_seq_start_loc.size(0) == batchs diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 72ed81bcbbd3..25975c7ed5fb 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -60,7 +60,7 @@ def _sgmv_expand_slice_kernel( offset_k = tl.arange(0, BLOCK_K) ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + offset_k[None, :] * xk_stride, ) b_ptr = (lora_ptr + l0_stride * lora_index + @@ -140,7 +140,6 @@ def sgmv_expand_slice( assert lora_b_weights.dtype in [ torch.float16, torch.bfloat16, - torch.float32, ] assert inputs.size(1) == lora_b_weights.size(-1) assert b_seq_start_loc.size(0) == batchs diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 9fc7508c9421..45aeb9e9fb78 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -118,7 +118,7 @@ def sgmv_shrink( b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative sequence lengths of the sequences in the batch, used to index into sequence. E.g.,if the sequence length is [4, 6], it is - [0, 4, 10]. + [0, 4]. seq_len_tensor (torch.Tensor): (batch_size,). record the sequence length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index @@ -129,7 +129,11 @@ def sgmv_shrink( scaling (float): Scaling factor. 
""" assert inputs.dtype == lora_a_weights.dtype - assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + assert inputs.dtype in [torch.float16, torch.bfloat16] + assert lora_a_weights.dtype in [ + torch.float16, + torch.bfloat16, + ] assert inputs.size(1) == lora_a_weights.size(-1) assert b_seq_start_loc.size(0) == batchs assert lora_indices_tensor.size(0) == batchs @@ -148,16 +152,13 @@ def sgmv_shrink( BLOCK_N = 16 BLOCK_K = 32 SPLIT_K = 8 - EVEN_K = K % (BLOCK_K*SPLIT_K) == 0 + EVEN_K = K % (BLOCK_K * SPLIT_K) == 0 grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), SPLIT_K, batchs, ] - # grid = lambda META: (triton.cdiv(max_seq_length, META[ - # 'BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),META[ - # 'SPLIT_K'],batchs) _sgmv_shrink_kernel[grid]( inputs, lora_a_weights, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index ad48abf9bb9c..7a07e73a116c 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -4,12 +4,12 @@ import torch -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_shrink import sgmv_shrink from vllm.lora.ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink +from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice -from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice +from vllm.lora.ops.sgmv_shrink import sgmv_shrink def _raise_import_error(e): @@ -155,9 +155,6 @@ def add_lora( scale) - - - def add_lora_slice( y: torch.Tensor, x: torch.Tensor, @@ -232,6 +229,7 @@ def add_lora_slice( y_offset, ) + def add_lora_triton( y: torch.Tensor, x: torch.Tensor, @@ -248,11 +246,42 @@ def add_lora_triton( *, buffer: Optional[torch.Tensor] = None, ): + """ + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_t_all[lora_index_tensor[i], layer_idx, :, :].transpose(-1, -2) + @ wb_t_all[lora_index_tensor[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + Args: + y (torch.Tensor): (batch_size, output_dim).Will be changed in-place. + x (torch.Tensor): (batch_size, hidden_dim) + wa_t_all (torch.Tensor): (num_loras, lora_rank, hidden_dim) + wb_t_all (torch.Tensor): (num_loras, output_dim, lora_rank) + b_seq_start_tensor (torch.Tensor): (batch_size,). The cumulative + sequence lengths of the sequences in the batch, used to index + into sequence. E.g.,if the sequence length is [4, 6], it is + [0, 4]. Used only during the prefilling stage. + seq_length_tensor (torch.Tensor): batch_size,). record the sequence + length of the sequences in the batch. Used only during the + prefilling stage. + lora_index_tensor (torch.Tensor): (batch_size,). The LoRA index + corresponding to each batch + batch_size (int): batch size. Used only during the prefilling stage. + max_length (int): maximum seq length in the batch.Used only during the + prefilling stage. + layer_idx (int): Layer index of LoRA weights. + scale (float): Scaling factor. + is_prefilling (bool): True indicates the prefilling stage, while False + indicates the decoding stage." + buffer (Optional[torch.Tensor], optional): (batch_size,rank) + """ r = wb_t_all.size(-1) if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. 
+ # We set the buffer to be float32 by default ,refer to: + # https://github.com/triton-lang/triton/issues/1387 + buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) @@ -342,6 +371,7 @@ def _lora_bgmv( batch_size, add_inputs=True) + def add_lora_triton_slice( y: torch.Tensor, x: torch.Tensor, @@ -361,30 +391,9 @@ def add_lora_triton_slice( buffer: Optional[torch.Tensor] = None, ): """ - Same as `add_lora` but you can operate on slices of y. + Same as `add_lora_triton` but you can operate on slices of y. Pass whole y, define y_offset and y_slice_size. - - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed - LoRA A matrices. - wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed - LoRA B matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - y_offset: Offset to apply to the starting column of y. - y_slice_size: Size of the y column slice. - #""" + """ # try: # import vllm._punica_C as punica_kernels # except ImportError as e: From a7b53708bf50886e1030810bf4145538d6b9e8a3 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 13 Jun 2024 14:21:42 +0800 Subject: [PATCH 19/71] tuning bgmv --- tests/lora/test_triton_punica.py | 79 +++++++++----------- vllm/lora/ops/bgmv_expand.py | 35 +++++---- vllm/lora/ops/bgmv_expand_slice.py | 37 ++++++---- vllm/lora/ops/bgmv_shrink.py | 27 ++++--- vllm/lora/ops/sgmv_expand_slice.py | 2 +- vllm/lora/ops/utils.py | 57 +++++++++++++++ vllm/lora/punica.py | 114 ++++++++++++++++++----------- 7 files changed, 226 insertions(+), 125 deletions(-) create mode 100644 vllm/lora/ops/utils.py diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 8f28821a9336..a098aba16456 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -69,7 +69,7 @@ BATCHS = [1, 2, 4] + [_BATCH_SIZE_ALIGNMENT * i for i in range(1, 8)] NUM_LORA = [1, 4, 8, 16, 32, 64, 128, 256] -DTYPES = [torch.float16,torch.bfloat16] +DTYPES = [torch.float16, torch.bfloat16] MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] SCALES = [0.5] OP_TYPES = ["shrink", "expand"] @@ -107,7 +107,7 @@ def _torch_groupgemm( out_list = [] current_offset = 0 for lora_index, b_length in zip(range(batchs), seq_len_tensor): - input_weight = inputs[current_offset : b_length + current_offset, :] + input_weight = inputs[current_offset:b_length + current_offset, :] current_offset += b_length lora_weight = lora_weights[lora_indices_tensor[lora_index]] result = torch.nn.functional.linear(input_weight, lora_weight) @@ -121,29 +121,27 @@ def _torch_groupgemm( return -def _generate_data( - batchs, hidden_size, lora_nums, max_rank, max_length, dtype, op_type, device -): +def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, + op_type, device): if max_length == 1: max_length += 1 - seq_len_tensor = torch.randint(1, max_length, (batchs,)).to(device) + seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, ).to(device) total_tokens = seq_len_tensor.sum() if op_type == "shrink": - inputs_tensor = torch.rand((total_tokens, 
hidden_size), dtype=dtype).to( - device - ) + inputs_tensor = torch.rand((total_tokens, hidden_size), + dtype=dtype).to(device) lora_weights = torch.rand( (lora_nums, max_rank, hidden_size), # col-major dtype=dtype, ).to(device) # shrink op need atomic_add, so output is initinized by 0 - ref_out_tensor = torch.zeros( - (total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device - ) + ref_out_tensor = torch.zeros((total_tokens, max_rank), + dtype=dtype, + device=inputs_tensor.device) # NOTE shrink kernel using torch.float32 as output type our_out_tensor = torch.zeros( (total_tokens, max_rank), @@ -169,16 +167,15 @@ def _generate_data( # Ensure the same input. our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint( - 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs,) - ).to(device) + lora_indices_tensor = torch.randint(0, + lora_nums - 1 if lora_nums > 1 else 1, + (batchs, )).to(device) indices = torch.zeros((total_tokens), dtype=torch.long).to(device) current_offset = 0 for b_id in range(batchs): lora_index = lora_indices_tensor[b_id] - indices[ - current_offset : current_offset + seq_len_tensor[b_id] - ] = lora_index.item() + indices[current_offset:current_offset + + seq_len_tensor[b_id]] = lora_index.item() current_offset += seq_len_tensor[b_id].item() return ( inputs_tensor, @@ -192,12 +189,11 @@ def _generate_data( ) -def _generate_data_expand_nslices( - batchs, hidden_size, lora_nums, max_rank, max_length, dtype, nslices, device -): +def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, + max_length, dtype, nslices, device): if max_length == 1: max_length += 1 - seq_len_tensor = torch.randint(1, max_length, (batchs,)).to(device) + seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, @@ -214,8 +210,7 @@ def _generate_data_expand_nslices( torch.rand( (lora_nums, hidden_size, max_rank), # col-major dtype=dtype, - ).to(device) - ) + ).to(device)) # expand op needs to complete y+=a@lora_b, so output is # initinized randomly ref_out_tensor = torch.rand( @@ -226,16 +221,15 @@ def _generate_data_expand_nslices( # Ensure the same input. 
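# Illustrative sketch (not from this patch): the column layout the nslices
# expand tests rely on. Slice i of the LoRA-B weights writes hidden_size
# columns starting at slice_offset = i * hidden_size of one shared output,
# matching the `slice_offset += hidden_size` loops in the tests; per-sequence
# LoRA selection is dropped here for brevity.
import torch

total_tokens, rank, hidden_size, nslices = 12, 16, 64, 2
x = torch.randn(total_tokens, rank)
lora_b = [torch.randn(hidden_size, rank) for _ in range(nslices)]
out = torch.zeros(total_tokens, hidden_size * nslices)

slice_offset = 0
for i in range(nslices):
    # expand slice i: out[:, offset:offset + hidden_size] += x @ B_i^T
    out[:, slice_offset:slice_offset + hidden_size] += x @ lora_b[i].t()
    slice_offset += hidden_size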
our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint( - 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs,) - ).to(device) + lora_indices_tensor = torch.randint(0, + lora_nums - 1 if lora_nums > 1 else 1, + (batchs, )).to(device) indices = torch.zeros((total_tokens), dtype=torch.long).to(device) current_offset = 0 for b_id in range(batchs): lora_index = lora_indices_tensor[b_id] - indices[ - current_offset : current_offset + seq_len_tensor[b_id] - ] = lora_index.item() + indices[current_offset:current_offset + + seq_len_tensor[b_id]] = lora_index.item() current_offset += seq_len_tensor[b_id].item() return ( inputs_tensor, @@ -363,9 +357,8 @@ def test_triton_sgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data( - batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device - ) + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): @@ -444,9 +437,8 @@ def test_triton_bgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data( - batchs, hidden_size, num_loras, rank, seq_len, dtype, op_type, device - ) + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) if op_type == "shrink": bgmv_shrink( @@ -454,7 +446,6 @@ def test_triton_bgmv_punica_bgmv( lora_weights, our_out_tensor, lora_indices_tensor, - batchs, scaling, ) else: @@ -463,7 +454,6 @@ def test_triton_bgmv_punica_bgmv( lora_weights, our_out_tensor, lora_indices_tensor, - batchs, add_inputs=True, ) lora_weights_4d = lora_weights.unsqueeze(dim=1) @@ -478,6 +468,7 @@ def test_triton_bgmv_punica_bgmv( ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) + @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", NSLICES) @@ -485,7 +476,7 @@ def test_triton_bgmv_punica_bgmv( @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_sgmv_expand_slice( - batchs:int, + batchs: int, hidden_size: int, nslices: int, dtype: str, @@ -555,6 +546,7 @@ def test_sgmv_expand_slice( slice_offset += hidden_size assert_close(our_outputs, ref_outputs) + @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", NSLICES) @@ -562,7 +554,7 @@ def test_sgmv_expand_slice( @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_bgmv_expand_slice( - batchs:int, + batchs: int, hidden_size: int, nslices: int, dtype: str, @@ -604,8 +596,7 @@ def test_bgmv_expand_slice( our_outputs, lora_indices_tensor, slice_offset, - hidden_size, - batchs, + slice_size=hidden_size, add_inputs=True, ) lora_weights_4d = lora_weights.unsqueeze(dim=1) @@ -626,8 +617,8 @@ def test_bgmv_expand_slice( if __name__ == "__main__": test_bgmv_expand_slice( - batchs=256, - hidden_size=3424, + batchs=32, + hidden_size=128, nslices=2, dtype=torch.bfloat16, seed=0, diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 8ec26bdb6b83..04fdd670243d 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -8,6 +8,8 @@ import torch import triton import triton.language as tl +from typing import Dict, Optional +from .utils import get_lora_op_configs @triton.jit @@ -88,8 +90,8 @@ def bgmv_expand( lora_b_weights: torch.Tensor, output_tensor: torch.Tensor, 
lora_indices_tensor: torch.Tensor, - batchs: int, - add_inputs: bool = False, + add_inputs: bool = True, + override_config: Optional[Dict[str, int]] = None, ): """ Args: @@ -110,7 +112,6 @@ def bgmv_expand( ] assert inputs.size(1) == lora_b_weights.size(-1) - assert lora_indices_tensor.size(0) == batchs assert inputs.is_contiguous() assert output_tensor.is_contiguous() @@ -125,9 +126,9 @@ def bgmv_expand( # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - BLOCK_N = 256 + # BLOCK_N =64 BLOCK_K = triton.next_power_of_2(K) - SPLIT_N = 64 + # SPLIT_N = 8 EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False @@ -136,10 +137,17 @@ def bgmv_expand( torch.bfloat16, ]: CAST_TYPE = True - grid = [ - SPLIT_N, + config = {"BLOCK_N": 64, "SPLIT_N": 8} + batchs = lora_indices_tensor.size(0) + + if override_config: + config = override_config + else: + config = get_lora_op_configs("expand", batchs, N) + grid = lambda META: ( + META["SPLIT_N"], batchs, - ] + ) _bgmv_expand_kernel[grid]( inputs, lora_b_weights, @@ -154,11 +162,10 @@ def bgmv_expand( lora_b_weights.stride(2), output_tensor.stride(0), output_tensor.stride(1), - BLOCK_N, - BLOCK_K, - SPLIT_N, - EVEN_K, - ADD_INPUTS, - CAST_TYPE, + BLOCK_K=BLOCK_K, + EVEN_K=EVEN_K, + ADD_INPUTS=ADD_INPUTS, + CAST_TYPE=CAST_TYPE, + **config, ) return diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 3d41d064ea7c..becaf4f1ca07 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -8,6 +8,8 @@ import torch import triton import triton.language as tl +from typing import Any, Dict, Optional +from .utils import get_lora_op_configs @triton.jit @@ -92,8 +94,8 @@ def bgmv_expand_slice( lora_indices_tensor: torch.Tensor, slice_offset: int, slice_size: int, - batchs: int, - add_inputs: bool = False, + add_inputs: bool = True, + override_config: Optional[Dict[str, int]] = None, ): """ Args: @@ -114,7 +116,7 @@ def bgmv_expand_slice( torch.bfloat16, ] assert inputs.size(1) == lora_b_weights.size(-1) - assert lora_indices_tensor.size(0) == batchs + assert slice_size == lora_b_weights.size(-2) assert inputs.is_contiguous() assert output_tensor.is_contiguous() @@ -130,9 +132,9 @@ def bgmv_expand_slice( # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - BLOCK_N = 256 + # BLOCK_N = 256 BLOCK_K = triton.next_power_of_2(K) - SPLIT_N = 64 + # SPLIT_N = 64 EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False @@ -141,10 +143,18 @@ def bgmv_expand_slice( torch.bfloat16, ]: CAST_TYPE = True - grid = [ - SPLIT_N, + + batchs = lora_indices_tensor.size(0) + + if override_config: + config = override_config + else: + config = get_lora_op_configs("expand", batchs, N) + + grid = lambda META: ( + META["SPLIT_N"], batchs, - ] + ) _bgmv_expand_slice_kernel[grid]( inputs, lora_b_weights, @@ -160,11 +170,10 @@ def bgmv_expand_slice( output_tensor.stride(0), output_tensor.stride(1), slice_offset, - BLOCK_N, - BLOCK_K, - SPLIT_N, - EVEN_K, - ADD_INPUTS, - CAST_TYPE, + BLOCK_K=BLOCK_K, + EVEN_K=EVEN_K, + ADD_INPUTS=ADD_INPUTS, + CAST_TYPE=CAST_TYPE, + **config, ) return diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index d2166a5593ab..99b9d7ee5b9f 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -8,6 +8,8 @@ import torch import triton import triton.language as tl +from typing import Dict, Optional +from .utils import get_lora_op_configs @triton.jit @@ -81,8 +83,8 @@ def 
bgmv_shrink( lora_a_weights: torch.Tensor, output_tensor: torch.Tensor, lora_indices_tensor: torch.Tensor, - batchs: int, - scaling: float, + scaling: float = 1.0, + override_config: Optional[Dict[str, int]] = None, ): """ Args: @@ -101,7 +103,6 @@ def bgmv_shrink( torch.bfloat16, ] assert inputs.size(1) == lora_a_weights.size(-1) - assert lora_indices_tensor.size(0) == batchs assert inputs.is_contiguous() if lora_a_weights.ndim == 4: # shape:(lora_num,1,rank, size) @@ -112,14 +113,19 @@ def bgmv_shrink( assert lora_a_weights.is_contiguous() assert output_tensor.is_contiguous() # TODO tuning this config + batchs = lora_indices_tensor.size(0) N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank - BLOCK_K = 256 BLOCK_N = triton.next_power_of_2(output_tensor.size(1)) - SPLIT_K = 64 - grid = [ - SPLIT_K, + if override_config: + config = override_config + else: + # First try to load optimal config from the file + config = get_lora_op_configs("shrink", batchs, K) + + grid = lambda META: ( + META["SPLIT_K"], batchs, - ] + ) _bgmv_shrink_kernel[grid]( inputs, lora_a_weights, @@ -135,8 +141,7 @@ def bgmv_shrink( lora_a_weights.stride(2), output_tensor.stride(0), output_tensor.stride(1), - BLOCK_N, - BLOCK_K, - SPLIT_K, + BLOCK_N=BLOCK_N, + **config, ) return diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 25975c7ed5fb..2fdedd591032 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -60,7 +60,7 @@ def _sgmv_expand_slice_kernel( offset_k = tl.arange(0, BLOCK_K) ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - + a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride + offset_k[None, :] * xk_stride, ) b_ptr = (lora_ptr + l0_stride * lora_index + diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py new file mode 100644 index 000000000000..fb8110b90564 --- /dev/null +++ b/vllm/lora/ops/utils.py @@ -0,0 +1,57 @@ +import functools +import json +import os +from typing import Dict, Optional + + +def _get_config_file_name( + op_type: str, + batchs: int, + hidden_size: int, +) -> str: + # device_name = torch.cuda.get_device_name().replace(" ", "_") + device_name = "NVIDIA_GeForce_RTX_3090" + return ( + f"op_type={op_type},batchs={batchs},hidden_size={hidden_size} " + + f"device_name={device_name}.json" + ) + + +@functools.lru_cache +def _get_op_configs( + op_type: str, batch: int, hidden_size: int +) -> Optional[Dict[str, int]]: + FOLDER_NAME = "bgmv_configs" + json_file_name = _get_config_file_name(op_type, batch, hidden_size) + + config_file_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + FOLDER_NAME, + json_file_name, + ) + if os.path.exists(config_file_path): + with open(config_file_path) as f: + tuned_config = json.load(f).get( + f"batchs={batch},hidden_size={hidden_size}", None + ) + return tuned_config + + # If no optimized configuration is available, return None + return None + + +def _get_default_config(op_type: str, batch: int, hidden_size: int): + if op_type == "expand": + return {"BLOCK_N": 256, "SPLIT_N": 8, "num_warps": 8} + else: + return {"BLOCK_K": 32, "SPLIT_K": 64, "num_warps": 8} + # raise NotImplementedError + + +def get_lora_op_configs( + op_type: str, batch: int, hidden_size: int +) -> Dict[str, int]: + config = _get_op_configs(op_type, batch, hidden_size) + if not config: + config = _get_default_config(op_type, batch, hidden_size) + return config diff --git 
a/vllm/lora/punica.py b/vllm/lora/punica.py index 7a07e73a116c..ba387fc2010f 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -246,7 +246,7 @@ def add_lora_triton( *, buffer: Optional[torch.Tensor] = None, ): - """ + """ Semantics: y[i] += ( x[i].unsqueeze(0) @@ -264,18 +264,18 @@ def add_lora_triton( into sequence. E.g.,if the sequence length is [4, 6], it is [0, 4]. Used only during the prefilling stage. seq_length_tensor (torch.Tensor): batch_size,). record the sequence - length of the sequences in the batch. Used only during the + length of the sequences in the batch. Used only during the prefilling stage. lora_index_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch batch_size (int): batch size. Used only during the prefilling stage. - max_length (int): maximum seq length in the batch.Used only during the + max_length (int): maximum seq length in the batch.Used only during the prefilling stage. layer_idx (int): Layer index of LoRA weights. scale (float): Scaling factor. - is_prefilling (bool): True indicates the prefilling stage, while False + is_prefilling (bool): True indicates the prefilling stage, while False indicates the decoding stage." - buffer (Optional[torch.Tensor], optional): (batch_size,rank) + buffer (Optional[torch.Tensor], optional): (batch_size,rank) """ r = wb_t_all.size(-1) if buffer is None: @@ -307,7 +307,6 @@ def add_lora_triton( wa_t_all, wb_t_all, lora_indices_tensor, - batch_size, layer_idx, scale, buffer=buffer, @@ -358,18 +357,12 @@ def _lora_bgmv( wa_t_all: torch.Tensor, wb_t_all: torch.Tensor, lora_indices_tensor: torch.Tensor, - batch_size: int, layer_idx: int, scale: float, buffer: torch.Tensor, ): - bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, batch_size, scale) - bgmv_expand(buffer, - wb_t_all, - y, - lora_indices_tensor, - batch_size, - add_inputs=True) + bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, scale) + bgmv_expand(buffer, wb_t_all, y, lora_indices_tensor, add_inputs=True) def add_lora_triton_slice( @@ -408,22 +401,53 @@ def add_lora_triton_slice( dtype=torch.float32, device=x.device) if is_prefilling: - _lora_sgmv_nslice(y, x, wa_t_all, wb_t_all, b_seq_start_tensor, - seq_length_tensor, lora_indices_tensor, batch_size, - max_length, layer_idx, scale, y_offset, y_slice_size, - buffer) + _lora_sgmv_nslice( + y, + x, + wa_t_all, + wb_t_all, + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + layer_idx, + scale, + y_offset, + y_slice_size, + buffer, + ) else: - _lora_bgmv_nslice(y, x, wa_t_all, wb_t_all, lora_indices_tensor, - batch_size, layer_idx, scale, y_offset, y_slice_size, - buffer) + _lora_bgmv_nslice( + y, + x, + wa_t_all, + wb_t_all, + lora_indices_tensor, + layer_idx, + scale, + y_offset, + y_slice_size, + buffer, + ) -def _lora_sgmv_nslice(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, b_seq_start_tensor: torch.Tensor, - seq_length_tensor: torch.Tensor, - lora_indices_tensor: torch.Tensor, batch_size: int, - max_length: int, layer_idx: int, scale: float, - y_offset: int, y_slice_size: int, buffer): +def _lora_sgmv_nslice( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + b_seq_start_tensor: torch.Tensor, + seq_length_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batch_size: int, + max_length: int, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, + buffer, +): sgmv_shrink( x, wa_t_all, @@ -450,17 +474,25 @@ def 
_lora_sgmv_nslice(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, ) -def _lora_bgmv_nslice(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, batch_size: int, - layer_idx: int, scale: float, y_offset: int, - y_slice_size: int, buffer): - bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, batch_size, scale) - bgmv_expand_slice(buffer, - wb_t_all, - y, - lora_indices_tensor, - y_offset, - y_slice_size, - batch_size, - add_inputs=True) +def _lora_bgmv_nslice( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, + buffer, +): + bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, scale) + bgmv_expand_slice( + buffer, + wb_t_all, + y, + lora_indices_tensor, + y_offset, + y_slice_size, + add_inputs=True, + ) From dc72d7ab821b8c4434d2d8192a400125da12433a Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 13 Jun 2024 14:32:03 +0800 Subject: [PATCH 20/71] add tuning config --- ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=43264 
device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=43264 
device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=43264 
device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...dden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++ ...den_size=43264 
device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 +++++++
vllm/lora/ops/utils.py | 3 +--
1151 files changed, 8051 insertions(+), 2 deletions(-)
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 
device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 
device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 
device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size={2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json (one config file per listed hidden_size; likewise for the lines below)
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608, 49152, 512, 5120, 5504, 5632, 6144, 6400, 64000, 64256, 6848, 6912, 7168, 8192, 9216} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size={1024, 10240, 102400, 102656, 11008, 1152, 128, 1280, 128000, 128256, 13824, 14336, 1536, 15360, 2048, 22016, 2304, 24576, 256, 2560, 27392, 2752, 27648, 3072, 32000, 32256, 32512, 32768, 33024, 3328, 3456, 3584, 36864, 4096, 43264, 4608} device_name=NVIDIA_GeForce_RTX_3090.json
create mode 100644
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 
device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json create mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..3b39ab85d9b3
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=1024": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 64,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..926c453330ce
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=10240": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..3403b6a8a156
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=102400": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..00a40f0fb282
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=102656": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..fe2ef151f545
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=11008": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..e72812a699b8
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=1152": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 64,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..6af79154d137
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=128": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 16,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..ede75bf5ee4a
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=1280": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 64,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..7c18b5d9e89a
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=128000": {
+        "BLOCK_N": 256,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..74123059d34c
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=128256": {
+        "BLOCK_N": 256,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..e6204367ba9f
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=13824": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..619d49755fbd
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=14336": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..da168958d44b
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=1536": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 64,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..b774e5e73509
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=15360": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..5df0d12a0066
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=2048": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 64,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..e669eec80db8
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=22016": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..6f248613276a
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=2304": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 128,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..4b800fceca15
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=24576": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..97ef5bd49850
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=256": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 32,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..498985dfa565
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=2560": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 128,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..888779c1a242
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=27392": {
+        "BLOCK_N": 128,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..90c40f66516a
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=2752": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 64,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..725987ef135d
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=27648": {
+        "BLOCK_N": 128,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..6758c49d6d53
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=3072": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 128,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..739073148751
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=32000": {
+        "BLOCK_N": 128,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..3ba9089734f5
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=32256": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..74d73ee28866
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=32512": {
+        "BLOCK_N": 128,
+        "SPLIT_N": 256,
+        "num_warps": 4
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..cd011852520e
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=32768": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..bdb74bae1096
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=33024": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..90067aae86ea
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=3328": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 128,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..266f0b4643a2
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=3456": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 128,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..50eef61c7dc0
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=3584": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 128,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..1a0e6bad928a
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=36864": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..6379489182c5
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=4096": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 128,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..845e90469c7a
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=43264": {
+        "BLOCK_N": 64,
+        "SPLIT_N": 256,
+        "num_warps": 8
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 000000000000..6da3f2cdd17e
--- /dev/null
+++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,7 @@
+{
+    "batchs=1,hidden_size=4608": {
+        "BLOCK_N": 32,
+        "SPLIT_N": 256,
+        "num_warps": 4
+    }
+}
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5859f692c3b2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=49152": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a1ed01126386 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=512": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..20e62377ef27 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=5120": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cfd9a3f149ff --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=5504": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7e7ff82dc5f3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=5632": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6312c21225d5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6144": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7bda71f1c3e4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6400": { + "BLOCK_N": 32, + "SPLIT_N": 256, + 
"num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c39485cbc08c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=64000": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b1adfbe01c2c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=64256": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0b1aee061aa7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6848": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3a894b412e3f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6912": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0132c4375421 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=7168": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9ffe008aa83e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=8192": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..838189dba35d --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=9216": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5b1da4d44b94 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=1024": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c392909217f5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=10240": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2e160c4ae390 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=102400": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..46428cc0a9da --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=102656": { + "BLOCK_N": 512, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..34ff5ebb9fe7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=11008": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1375324c09ad --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=1152": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e8b0e9dbe8b0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=128": { + "BLOCK_N": 32, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..72fc4afd1efe --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=1280": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..97e7d9e7bd0d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=128000": { + "BLOCK_N": 1024, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..40a4a9526be0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=128256": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ac35eea6297a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=13824": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..38b1819b0120 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=14336": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..668669e9fb4a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=1536": { + 
"BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c23e4b555ab4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=15360": { + "BLOCK_N": 512, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..30715168cdd8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2048": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..050f3384e1cf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=22016": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3820959d0032 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2304": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..08d8f70e1e7a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=24576": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5cf06550f0b1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=256": { + "BLOCK_N": 32, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5ee401212495 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2560": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1d7db0c6a860 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=27392": { + "BLOCK_N": 512, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..77fc2358208c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2752": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3b33817c6ecb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=27648": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d62a622342b5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3072": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c7030ad5a673 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=32000": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ff76f3c110b9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=32256": { + "BLOCK_N": 512, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..721b587a948d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=32512": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..92932b62f1a1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=32768": { + "BLOCK_N": 256, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7cf1394d96bd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=33024": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bec43f2e9cd1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3328": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8dfd12024faa --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3456": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c9fa0757f4d2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3584": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2323a50dfb84 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=36864": { + 
"BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..41e170807720 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=4096": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b04da877902c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=43264": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0df3ef025f97 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=4608": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..07e41596ed86 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=49152": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9013302be01a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=512": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..520a85f2e70a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=5120": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..11a3940a9d4a --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=5504": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..43f4baa91a71 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=5632": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2a5260ec1d4d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=6144": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..962399539ec2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=6400": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cbc8f93ce329 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=64000": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..95e76f479321 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=64256": { + "BLOCK_N": 512, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b967d91645ed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=6848": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6c2f971176df --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=6912": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e0203c01009e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=7168": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0db797564e0d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=8192": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ab2faa8a3e47 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=9216": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..99d36f108d24 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1024": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5d1797c7df6a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=10240": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3c5a379e0bdc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=102400": { + "BLOCK_N": 32, + 
"SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..16f2497bed72 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=102656": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..de9477263adf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=11008": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..58b67d1eb450 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1152": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3b20b74c6b65 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=128": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..711133cabf41 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1280": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8ae36e752fa0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=128000": { + "BLOCK_N": 1024, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9bd2e640b741 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=128256": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e9ef08e28930 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=13824": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..02df6114edd3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=14336": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f174cccf6781 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1536": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..95398b063d5a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=15360": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a9eb1222067d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2048": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b0376ebedcf0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=22016": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..33a78ee55501 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2304": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5d62f3bc77ad --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=24576": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bb04a656ec58 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=256": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..76589c2e4848 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2560": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c4f0e1fe7d02 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=27392": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4e3e51669b1a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2752": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..76507320d8c9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=27648": { + "BLOCK_N": 64, + "SPLIT_N": 64, + 
"num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..95aefbac204c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3072": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..804c9a7df946 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32000": { + "BLOCK_N": 512, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8c4069b4c997 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32256": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0de6a013f40e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32512": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fbc4f954e962 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32768": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..89dff508fcff --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=33024": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fb48ebaa8f79 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3328": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7603143e1ff2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3456": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f119d16aa81c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3584": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b19e8f096df7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=36864": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e870cd0967a0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=4096": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b30bd7d45e40 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=43264": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..29ab8038c085 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=4608": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..39faabbdede5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=49152": { + "BLOCK_N": 128, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..87cad481bbef --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=512": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ec7d7b1e6d69 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=5120": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..779014c6a48d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=5504": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e4e1610914d4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=5632": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..91e760ed29e7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6144": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d1dba65beeb5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6400": { + "BLOCK_N": 32, + "SPLIT_N": 256, + 
"num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..63cd02986d0e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=64000": { + "BLOCK_N": 128, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..53bd1e2a033d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=64256": { + "BLOCK_N": 256, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ceaea00d1ad4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6848": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f7fe5d732ec5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6912": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d026e12311a7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=7168": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1ce097bb563d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=8192": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f22a1513a6a4 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=9216": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dc055ce5a023 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1024": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fec5aa8a43a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=10240": { + "BLOCK_N": 512, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f76e21dcf101 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=102400": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..02e01a88f229 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=102656": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ca13ab17631b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=11008": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..221b7046a42d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1152": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..04e2fee606e4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=128": { + "BLOCK_N": 64, + "SPLIT_N": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f3e8e4c95080 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1280": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..92d8dc48ef21 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=128000": { + "BLOCK_N": 512, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..deddd170e828 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=128256": { + "BLOCK_N": 256, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d8dba1076582 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=13824": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..585c1d33ce0d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=14336": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3a2668c5d3fd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1536": { + 
"BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ac9189fdf242 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=15360": { + "BLOCK_N": 64, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9cc54aa3ceae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2048": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a49ce276482b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=22016": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b66638610a4a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2304": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6a6f0780ee68 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=24576": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..64cdc015d4f5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=256": { + "BLOCK_N": 32, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d768b5944d32 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2560": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e1af0bc7d4a2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=27392": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..de80b48b9e46 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2752": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2f1cc53d9d07 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=27648": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5399d322bca2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3072": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ebe63936e73e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32000": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d0af75521d17 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32256": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ecb601665b16 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32512": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0a95531a6226 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32768": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e927a860d646 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=33024": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..64757e015c63 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3328": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..de0faf408e1f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3456": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5c93deb397ab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3584": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3ce9a9150319 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=36864": { + 
"BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..82f6e893c6ad --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=4096": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e21f857efe75 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=43264": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b41625baab6a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=4608": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9ce03febee07 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=49152": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..efecf1f371f9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=512": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b1a5db564eba --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=5120": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d105bc53b555 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=5504": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e042d730fde0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=5632": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..005e8480d530 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6144": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b0ce387d16a7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6400": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fe8d2347f8be --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=64000": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..19016d92afc8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=64256": { + "BLOCK_N": 1024, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c24e63d4b061 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6848": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..621e45f3d647 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6912": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1764f42a3690 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=7168": { + "BLOCK_N": 256, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2c005c34c3b5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=8192": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7a560bf5b977 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=9216": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0fb0abbfd93b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1024": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5089debbfefd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=10240": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..879a1d36c817 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=102400": { + 
"BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..eda15de61763 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=102656": { + "BLOCK_N": 64, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..10b2451c5f88 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=11008": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3ef065e97426 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1152": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..26ecc87645eb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=128": { + "BLOCK_N": 32, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cea687cffe03 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1280": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..50b07329f967 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=128000": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c9bc76d4e02d --- 
/dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=128256": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5e2bd08a4728 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=13824": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..95c19b2bafe1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=14336": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2a3648ab54f4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1536": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d384ecbf3556 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=15360": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..017eb531ae10 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2048": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..841202546c13 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=22016": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..78e4c0323585 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2304": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1ac24aae29fe --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=24576": { + "BLOCK_N": 64, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8422904ffb26 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=256": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8a0eb3dd15c3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2560": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0e433e6bed3d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=27392": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d6adf2ceb6b6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2752": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bca14db36270 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + 
"batchs=32,hidden_size=27648": { + "BLOCK_N": 64, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..985d62438445 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3072": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e8464e7008ce --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32000": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a47ee1a7459e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32256": { + "BLOCK_N": 1024, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..889bbdc9e7c8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32512": { + "BLOCK_N": 256, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7f722c7cc4f2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32768": { + "BLOCK_N": 512, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ee0493b17a74 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=33024": { + "BLOCK_N": 32, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 
000000000000..552f181d0e3a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3328": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d2fb833b52b7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3456": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6fd72f2cb7df --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3584": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b566409ba7bc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=36864": { + "BLOCK_N": 512, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..02781cc3c3a3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=4096": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5eb93d119fd2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=43264": { + "BLOCK_N": 64, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..691bf7cf400d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=4608": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e6f417f1ce35 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=49152": { + "BLOCK_N": 64, + "SPLIT_N": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ecc4311c65c6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=512": { + "BLOCK_N": 32, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2f8e3bad9359 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=5120": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..28bb43c5cea0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=5504": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ffd213467c8b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=5632": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..43dd6d5c4f34 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=6144": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a73329612467 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + 
"batchs=32,hidden_size=6400": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1537688252ba --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=64000": { + "BLOCK_N": 512, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bdd7de7b0544 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=64256": { + "BLOCK_N": 512, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..537e8a289957 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=6848": { + "BLOCK_N": 128, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5822a67015d7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=6912": { + "BLOCK_N": 256, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..091e7c378078 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=7168": { + "BLOCK_N": 64, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d0c8d9d533a7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=8192": { + "BLOCK_N": 128, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 
000000000000..9b047851381d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=9216": { + "BLOCK_N": 64, + "SPLIT_N": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ba9a5daa8327 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=1024": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..49bcf2f569f7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=10240": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6285daf17f71 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=102400": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bd9ce93f682f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=102656": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..939967371660 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=11008": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..798594bfd3a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=1152": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3fa0c0edbdfe --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=128": { + "BLOCK_N": 32, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..531e3e4accaa --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=1280": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9d597ca3ab45 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=128000": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7826d6de2043 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=128256": { + "BLOCK_N": 128, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4382dfac1232 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=13824": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0dc25b7a9c47 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=14336": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2cb628ac30d9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + 
"batchs=4,hidden_size=1536": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d3ade6322fcf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=15360": { + "BLOCK_N": 256, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f743a190ff6c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2048": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..13dc549b58a4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=22016": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dd31e03333ef --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2304": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f873a2168d70 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=24576": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f654b1763c6b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=256": { + "BLOCK_N": 32, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c3d6c38da9ce --- 
/dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2560": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6ebeee44d74f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=27392": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4583c1bd2a74 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2752": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..457cba2bb27a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=27648": { + "BLOCK_N": 128, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6c60fd9cf325 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3072": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..12dff65ef5e3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32000": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7e6bcdc82b12 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32256": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5cef4c0639e2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32512": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d475b36f7b10 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32768": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..62bd24b55325 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=33024": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b9d49f65f25c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3328": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f13c7ea9fcdb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3456": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3c8bb10faf54 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3584": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..eeeed1d55f4c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=36864": { + "BLOCK_N": 128, + "SPLIT_N": 256, 
+ "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c1780da9065d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=4096": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..73265ea43e99 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=43264": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..95ef324ce999 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=4608": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a2861173e71e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=49152": { + "BLOCK_N": 256, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..03789328aa67 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=512": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2ecae2ab22ab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=5120": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7979c4049101 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=5504": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e11fdff5cf8b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=5632": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b5d4291484ac --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6144": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..510e5bcdd8f9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6400": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a7d9938f211d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=64000": { + "BLOCK_N": 256, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4b4d5715c4a6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=64256": { + "BLOCK_N": 128, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bdc940e4306d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6848": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6ac8e567768d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6912": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a5a8025c74e2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=7168": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4937a00c96b9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=8192": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..14d37e0e84da --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=9216": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0655aeaf04d4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1024": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4da43af96a88 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=10240": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..02944f10112d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=102400": { + "BLOCK_N": 64, + "SPLIT_N": 128, + 
"num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..888cbee83cd0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=102656": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9b48040ae35d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=11008": { + "BLOCK_N": 128, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..17628098a876 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1152": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ba77dfd4e745 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=128": { + "BLOCK_N": 32, + "SPLIT_N": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..afc038f82824 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1280": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cfca3795cf0c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=128000": { + "BLOCK_N": 512, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..40205831e8c5 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=128256": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..73480c2a2fb0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=13824": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0ba68ff88dd7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=14336": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1411944ed903 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1536": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..52d4d6d866da --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=15360": { + "BLOCK_N": 1024, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5e8c8b03d807 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2048": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ea2e2b703621 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=22016": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e07a33a9f890 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2304": { + "BLOCK_N": 64, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1cd26dfc178c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=24576": { + "BLOCK_N": 128, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2101c81521e4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=256": { + "BLOCK_N": 32, + "SPLIT_N": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c6545c96c672 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2560": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dee776ee0b6b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=27392": { + "BLOCK_N": 256, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b34f648169cf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2752": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b90111549674 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=27648": { + "BLOCK_N": 128, + "SPLIT_N": 16, + 
"num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a07aa0c23f3e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3072": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..70c8889ecf4f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32000": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9050b5588db6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32256": { + "BLOCK_N": 128, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..64a0a899f92c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32512": { + "BLOCK_N": 256, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3fc62e336640 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32768": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8bf15baf2d0e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=33024": { + "BLOCK_N": 128, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3b72a219f413 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3328": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..555f04b52080 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3456": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f5e814a48de6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3584": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..aed09238da26 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=36864": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e32fcb1ffb63 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=4096": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..120a7ac01e61 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=43264": { + "BLOCK_N": 256, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3580fa47a62f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=4608": { + "BLOCK_N": 32, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5b7de710ac80 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=49152": { + "BLOCK_N": 256, + "SPLIT_N": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c25b01a3a2ee --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=512": { + "BLOCK_N": 32, + "SPLIT_N": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..39e20cdd3dc4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=5120": { + "BLOCK_N": 32, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6103c0f82883 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=5504": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..534348364229 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=5632": { + "BLOCK_N": 64, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..29cdb7e2b43b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6144": { + "BLOCK_N": 128, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e5362a24e683 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6400": { + "BLOCK_N": 32, + "SPLIT_N": 256, + "num_warps": 
4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6f5a25fef3b2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=64000": { + "BLOCK_N": 256, + "SPLIT_N": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5cd46fba6793 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=64256": { + "BLOCK_N": 64, + "SPLIT_N": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7afa26abf9ed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6848": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c2fbb625f0cb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6912": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..182c287a971c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=7168": { + "BLOCK_N": 64, + "SPLIT_N": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..543cfc5cf252 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=8192": { + "BLOCK_N": 128, + "SPLIT_N": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..209fd07020ce --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=9216 
device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=9216": { + "BLOCK_N": 32, + "SPLIT_N": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e7807642e242 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d990a464aead --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=10240": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f8ba6f98e11d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=102400": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2820c3819fb1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=102656": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8ff38bafd95b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=11008": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2239222c6721 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128 
device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..76ebd9a6187b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=128": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..604d0f4a24f6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7b9522e223c9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..16ca324abb1d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..51fc783ba5be --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=13824": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2b5c9fb008a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=14336": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1d6fdcc9e4cf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0188952eabb5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=15360": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..748317facd1a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=2048": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..762c6902553f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=22016": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..50693dd027e9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=2304": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ce7f797b6501 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1fb7817d045a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..93cb002287b9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2560 
device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=2560": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..df245f7e4e3a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=27392": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c420bc28686a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=2752": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ec8253156d69 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=27648": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8eb952a2f70b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=3072": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5f4e26b927c4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6567393424e9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=32256": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32512 
device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0de485aa91ed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..382626229f94 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=32768": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f339e51540b0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=33024": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1c29335eaf49 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..164746c525a9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=3456": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..32f4a2527a5f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bf9fccfd9628 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=36864": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5afaf9370a75 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=4096": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..434e65d6010f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=43264": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5b05b7d563cb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=4608": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6ed84d41e515 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..57f7208d5b31 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a39eee775813 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..386dcefdd9ce --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5504 
device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=5504": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..404b2e3e7143 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=5632": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f0f167c7f637 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6144": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d5caa9380998 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6400": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..03c28033d268 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=64000": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e5055c7016d7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=64256": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..64efb8751002 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6848": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6912 
device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..090a44e33153 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=6912": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..47231de840f8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=7168": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d0b6943c5df2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=8192": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bcfe34389c8e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=1,hidden_size=9216": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9f2c62aa7bf3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f205785ac354 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3b1ae9af607c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=102400": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 4 + } +} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..21f6c7c4d2c1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..80432fa22304 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=11008": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..431749d79e07 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=1152": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e4cf65f2b466 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=128": { + "BLOCK_K": 64, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..456cdd0cbfa7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..657d4601e38d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3fdc882f8d4d --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f16d8e73b04f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cdb9f921d9e2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a4a2728ea3f7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=1536": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e250ad59c685 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=15360": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1fe78e780ab8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=2048": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ea5d35272955 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=22016": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2304 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c24d3ebad8f6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=2304": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..60d16b61c097 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1ac220f8cdb4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..593b6236bedc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=2560": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e02f41487062 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=27392": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0e21dad2a1bb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=2752": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e7de2732db5d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 
+1,7 @@ +{ + "batchs=104,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4594326f1214 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=3072": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..476b94614e61 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=32000": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3394e0b2b2e2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=32256": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..47b57eaba2d9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=32512": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2095dc93481f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=32768": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d7ae5539e292 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=33024": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3328 
device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..392502ed9bef --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=3328": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..caaf6dd953b3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=3456": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..34fa2b5fc43e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..71e95b2a3456 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1506da8d5b06 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=4096": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..82dd0f4c7d44 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=43264": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f97d81fa1054 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=4608": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff 
--git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..985c8781d3a0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7ca73c314f25 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1a63f1a720a0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=5120": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..022bfd82a54b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=5504": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9b22d0042659 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=5632": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4075a65a0a7c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=6144": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..eabfa5752d0a --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=6400": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4c12481b8079 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=64000": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e30688c4abd2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2e29f14a92cd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..13ff36677840 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=6912": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8539aa72ab90 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=7168": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..53b66c493ea1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=8192": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=9216 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a6dfe596884f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=104,hidden_size=9216": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..602ffc66510d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fafea8e3786d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=10240": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1eec6dc77df8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=102400": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dedb0618406f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=102656": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..de1dcf78c395 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=11008": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..73bc95410c42 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 
+1,7 @@ +{ + "batchs=16,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c7834800f885 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=128": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..65868fe3b2f4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6c18618b318f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=128000": { + "BLOCK_K": 256, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a036606de7cf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..89df4c522fae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=13824": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a142ada1ad07 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=14336": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 
100644 index 000000000000..9bb33d368fef --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3a988a776e6f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=15360": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a57d196ba0ae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2048": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c6aa7bf7b99b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=22016": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..83c7c4e4c9a2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2304": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..427c0106c6ab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=24576": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d47ff5bbb529 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2560 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6f1afccc1299 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2560": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..61a1b944f16c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=27392": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6423b56c688d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=2752": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..aa5170de0248 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ae0f48e917f2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3072": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6897b5908a5b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=32000": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4c40c7bfc193 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + 
"batchs=16,hidden_size=32256": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8b38bafedc79 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..81d3642fe4ec --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d5263b43cc15 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=33024": { + "BLOCK_K": 256, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..35d23d34e4a7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d72e173b284f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3456": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a6387b5fc49e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=3584": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 
000000000000..d80742511305 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b27c32ff573f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=4096": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..af6c84968dea --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=43264": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6da56b5f85f0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=4608": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..aa2f1263faae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f3b5f24c791b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dde06af0de34 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5504 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2dc362e1e1d4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=5504": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..76853f696c19 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=5632": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d498238ea795 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=6144": { + "BLOCK_K": 256, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fc10d7616de9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=6400": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..60aed7116c3d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=64000": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..98bccfc678a1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..71bf22fc018b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + 
"batchs=16,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0cbc42cfc86f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=6912": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ebab6f621840 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=7168": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7f2582d5fabb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=8192": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..37a66a2651be --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=16,hidden_size=9216": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cb914f7d052d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3cbfe81a8241 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 
000000000000..01318faf6258 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=102400": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7201e38109c1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d36bf9a65f9e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=11008": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..01a390c790a7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..068bf8ed0f4e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=128": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7172561ad91a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c39444441044 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128256 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0fe8be7dd9e4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=128256": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..463a4228b8b6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1c566ee67de8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=14336": { + "BLOCK_K": 32, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bd612e59861b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c6a18389e22b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=15360": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..de611d52747f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2048": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7692a7d496a3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + 
"batchs=2,hidden_size=22016": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..67533819f2b7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2304": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e7df88ce3d48 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=24576": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d60a540f6c9f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d4f9c110fc67 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2560": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..db4554d192bc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=27392": { + "BLOCK_K": 32, + "SPLIT_K": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e468805dd6bf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=2752": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ffbb85f3cc9d --- 
/dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..77046010f9f9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3072": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..101a1fde2e17 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..97ecc56f5a4b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32256": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f1f3fa79859c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32512": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f26ad0121efa --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..08f2719e2a70 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=33024": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..19650dbe5c76 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..43b03a77fccf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3456": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5ea0fd9b7fb1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..97e4bf17acb5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=36864": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b201e52f3f1a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=4096": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d79ac16a16e0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=43264": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9d1bc041bfa0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=4608": { + "BLOCK_K": 32, + "SPLIT_K": 64, + 
"num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..af3fbc6aa009 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=49152": { + "BLOCK_K": 32, + "SPLIT_K": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3e6ab8e331e8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3824f7dc6657 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=5120": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d1fd2e96658a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=5504": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d9642bf1e5bf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=5632": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..89e5b64ce96b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6144": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2345b7e5e882 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6400 
device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6400": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6de6f1c12486 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b6c138a55f28 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4a63ed737b7a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ee7b427fbae6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=6912": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..345cb9192105 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=7168": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..91dabeb15527 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new 
file mode 100644 index 000000000000..b90e9970c8c0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=2,hidden_size=9216": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d750dbc94ebf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..422be89522eb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=10240": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a43fa1f140e9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=102400": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..89e3dac0a31e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8c75da6742a4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=11008": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..81d9eeae3f90 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..32c16964bada --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=128": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cd66a4abdfdd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6e95d75f0b7a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6ffa41368045 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=128256": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0c993ab8ef41 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=13824": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4bef43ff2a51 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..382f702b15f5 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1b6fb8d0262c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=15360": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9f9ab7d1da6f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2048": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2ca6015cc8e2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=22016": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c01c77b849ad --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2304": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1ffc10aab8a9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=24576": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9c90c8e5d3d6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a4a5aff5f9bf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2560": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bfad4944a155 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=27392": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f377b9487847 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=2752": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..294a45493519 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5dd926e91002 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3072": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7b41f769bfea --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2f376b9d3d99 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32256": { + 
"BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5a03b6ba2242 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a093a839dd4e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..904257e85be9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=33024": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4f41954d2c44 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3328": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7b430bb3ef84 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3456": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5545e8a9bb58 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=3584": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..76e30d5c6721 --- /dev/null 
+++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=36864": { + "BLOCK_K": 256, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..60adb9b594de --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=4096": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..70fac02be5f4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=43264": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fac51807dd14 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=4608": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f8af00a3f0b2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=49152": { + "BLOCK_K": 256, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ec0716115b72 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5bfce900dd59 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..142a66db84ff --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=5504": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1e819ccf88b9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=5632": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..640dfe5c4aab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6144": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3f9f49c856ae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6400": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ee67fafb3f47 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=64000": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a4c7d1951a2d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=64256": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2668afddbf9d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6848": { + "BLOCK_K": 
64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..432191c74e6e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=6912": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..965794c29584 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=7168": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4099c8b31019 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=8192": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..54f120427642 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=24,hidden_size=9216": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bc0d77a48b76 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4d245dd8606c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=10240": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..15f2726ffb7f --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=102400": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..610e795d0889 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..86f0bd139844 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=11008": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..525db2cd9c83 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f154c1ac63ae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=128": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4d9ca99f621b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4ce6fea19d94 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..61b5a2e19b2a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=128256": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ee27b7eb7145 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=13824": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f1c0786c18a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2e80cd18b7e3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=1536": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a7ebd2698a89 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=15360": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a70cbb63546d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2048": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..411a6d693f37 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=22016": { + 
"BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7bf4485bdafb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2304": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4b83542ed831 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fcdf01c82b6e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..27fa6e4ab908 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2560": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f32c6f35e55b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=27392": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..97e139922a3b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=2752": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..633e173ede3d --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=27648": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4b2853e27e6d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3072": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dbca49ebf47f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bbbb809b4bac --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32256": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2db20b0f523a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32512": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b76c39dbc79c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=32768": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..95de274c392b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=33024": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..31cc0f0988df --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3328": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d5848f9c0dc2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3456": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..928c4793a1b4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=3584": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..87333a2977e6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a13f1ae708cc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=4096": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e263c22b48c8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=43264": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..28ccb7922928 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=4608": { + "BLOCK_K": 
64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..74bdc063f829 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=49152": { + "BLOCK_K": 32, + "SPLIT_K": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e65f07e0c1b7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..86b22822e193 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..068d658420a2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=5504": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bb22b22c0cf4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=5632": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..693e8b466480 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=6144": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..72e213c9c841 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=6400": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8983a00a5dc0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7ebc1a433047 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=64256": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..baf8710e7904 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=6848": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..83d82eab352e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=6912": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ea365d60b5f7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=7168": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e0e266cba149 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=8192": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..df672f000e51 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=32,hidden_size=9216": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..21b7f37b6f10 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=1024": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6780bd1b6061 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c8d5d48f3067 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=102400": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8092de1d055e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=102656": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..25352c1f541a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=11008": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..72a9af0df017 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=1152": { + "BLOCK_K": 64, + 
"SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8b8a4d4ba0a6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=128": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5a63ccf5ea27 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=1280": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f5322f696374 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=128000": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9c0b7751d2a2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3645b8d9bc6e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f8e1776dbeac --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..67a3fb9e41c7 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=1536": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d145bf3a8058 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=15360": { + "BLOCK_K": 256, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..17bbd1c727ce --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2048": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..50bb33d9ddc2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=22016": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..28de03a38564 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2304": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e8696121d871 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f24f53814e53 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..414e8136350b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2560": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d0a7f1ad50ca --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=27392": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..90b14d89c238 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=2752": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dfb4a2036591 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=27648": { + "BLOCK_K": 32, + "SPLIT_K": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..afcc971891fd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3072": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..86627bb4c2b6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 256, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..18f2e8d01075 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32256": { + "BLOCK_K": 64, + "SPLIT_K": 
64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..60216da8b12c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32512": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3f3d7c814ec2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=32768": { + "BLOCK_K": 32, + "SPLIT_K": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..11f41c734aed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=33024": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b7b78e643da8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dd301c41083a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3456": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..00e16062556b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..90b71d6a48aa --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4567fabf9fa3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=4096": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d758bd2bd5d4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=43264": { + "BLOCK_K": 64, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4dd89a2baa91 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=4608": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4e2ea6dc099c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..571e5f11fb3a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..72fbed4df169 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cdebe9fa2d4d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=5504": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b3315ac27857 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=5632": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0f58f063add0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6144": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..36435a35bd1f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6400": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a3d53ddfb20c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=64000": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9ec3fad337d0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..48399c1da49f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 64, + 
"num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..90a1a1dc7123 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=6912": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..490f4fe0958a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=7168": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7dfbd884de1e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..423a0a8bb660 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=4,hidden_size=9216": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a23caa15d938 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=1024": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..da975d0f7f04 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..765e06971801 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=102400": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ad02b596b368 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=102656": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6ea35e6b4067 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=11008": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3c5b43290284 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d1d0c2952eec --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=128": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7ac3b09650e6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7e364a12309c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..362a4794b89d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6daf2db30092 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d39676778929 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ca1458693a59 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=1536": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5caff317b920 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=15360": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bd333dbdea1b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=2048": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5b17698cf941 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=22016": { + 
"BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8c34a9ffe746 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=2304": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..628d5e31f820 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=24576": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..53510a4a3176 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..36e4a63d0806 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=2560": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e95f7c421e3c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=27392": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e9fde38aa4c1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=2752": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..675598e6916c --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=27648": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d0c7c62a69f4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=3072": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d38a57c5cb63 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=32000": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fbf651ee7ab1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=32256": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f56745eaf5d9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=32512": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c7e6acf8da4c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=32768": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e214f20d25dd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=33024": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c3a7b1b3d075 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..70c8b1a5cebe --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=3456": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7518d2514d3f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=3584": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8b8caf903283 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..24dfdeb31e27 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=4096": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..73974a4b5ec4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=43264": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..134f097ee092 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=4608": { + "BLOCK_K": 256, 
+ "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..99fd65e94e97 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 256, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a085010d32df --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..50fa49ac53a0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3444417e1cc2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=5504": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..15f14f8a9a1d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=5632": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1c05dce95c3a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=6144": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..56833977b005 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=6400": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bfa3f47fa2eb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4991337f39f4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ad9494f2d0ef --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..72ffcbf1889d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=6912": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c5ae6958ebdf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=7168": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bdd5edbfc87b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..610d6175f27e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=40,hidden_size=9216": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2548ac5d500e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..987dc9b61dc2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..848a752c0379 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=102400": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..faa00296ce21 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f280fe287d2b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=11008": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..19ddc1d36abf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=1152": { + 
"BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a7f53ede4c6d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=128": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d3c33217c882 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a8fb320c7b01 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=128000": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e0a7c154a14f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=128256": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c9b76cf2ea95 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=13824": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1fcf12b7eb96 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c335578c3dab --- /dev/null 
+++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b9f810c13912 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=15360": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..215df482ca64 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=2048": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..86e68481e14c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=22016": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ee1aa6e358cf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=2304": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f00777cd606a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0382b4bfcad6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0cd5ac59ca34 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=2560": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3e5825452ae7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=27392": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e9ebb39aaa52 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=2752": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0951d6150b20 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4f6ec8b3fcfc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=3072": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0dc115f3588e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=32000": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..55b7954dd82a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=32256": { + "BLOCK_K": 
128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..786022725aab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a80125409b55 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c2c745a5b6ab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=33024": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e8803b5163ed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=3328": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d3ee96ee2125 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=3456": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0ba06679888a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..db6774f5f21c --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a7ffc7684eaf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=4096": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6f775a7eaa65 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=43264": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..29275d15516d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=4608": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c1f3159e79ad --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=49152": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..43d2c867eb78 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3ef158578c25 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2fa3f9b2dd5a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=5504": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3b7df5022eed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=5632": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5150a9505d59 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=6144": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..15bd83f652b2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=6400": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..be3441337184 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5adbe1354608 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4d0c8340f94e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=6848": { + "BLOCK_K": 128, + 
"SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f25843901a29 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=6912": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e7f8fdd5c289 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=7168": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1c620ed90717 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ca7f79c797da --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=48,hidden_size=9216": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b18ad9ef740f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7e5e16254b14 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..66b9016b80d6 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=102400": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..41267d76bb7b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8b1bdd081b38 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=11008": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1b86365ec8d8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a3605ec25c3e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=128": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..177e169046c1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..eefe22a77acf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=128000": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6d4866d51277 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d25a6e558ef0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c13f346b0444 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..30fefd4cff6f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d70b173c3ae9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=15360": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9686ba58b423 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=2048": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ded89a74da24 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=22016": { + 
"BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1bf7575e2a95 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=2304": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ff3b1a65cd88 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=24576": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..32b1dc19f9a0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5a103600df73 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=2560": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..862580187e9b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=27392": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..765b7fcf0597 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=2752": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dad3be5ede3c --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=27648": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5e9de4977a0d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=3072": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..12c92e427c5d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cece35827652 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=32256": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..05a07685f4f8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..19b832c59326 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..13256a11de49 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=33024": { + "BLOCK_K": 256, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..71ca4b92a1c6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7a02f9a5753f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=3456": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..36ce27a20c7c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=3584": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e473965d0d89 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..aa7ad780b5ed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=4096": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d09de854e154 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=43264": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c600f9a4a564 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=4608": { + "BLOCK_K": 128, + 
"SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f2fcd1b21602 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a0f8e37807d3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9fbe3a6da66c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=5120": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..60d5c4dc40be --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=5504": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a36eb605fa4c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=5632": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ad7b11ec6f27 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=6144": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..528a7abd6ffd --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=6400": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5e1779ffba43 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c89b534e12a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=64256": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7943d0cfde5e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..06705f2aa342 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=6912": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..21fcc4c55de4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=7168": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c7b2c16677ae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..03d554a6d65e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=56,hidden_size=9216": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d0938a7af883 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..07bcdbf20094 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a3b3e1a40bd6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=102400": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..64d9dace43e6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=102656": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5dfc981bc2f6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=11008": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bbefa2f6fdc6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=1152": { + 
"BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c6f10cf4989f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=128": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4663289ea195 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=1280": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4f50abb8852a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a3cc61173170 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6a6d73ea7f2a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=13824": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7ed41fea026e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..230edf27c64b --- 
/dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=1536": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cdd186cc4b9f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=15360": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4f76011c145a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=2048": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7798c41d0f06 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=22016": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..49d5d5dd3ac6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=2304": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9f700c6130b4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=24576": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4cc76fa38c81 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..76ee8a7d6102 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=2560": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..71b40355f9d8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=27392": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7cdad6a26049 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=2752": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a17c9772d2bb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=27648": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a6d8b40ac252 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=3072": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bc0b651ecd4c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=32000": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c9c7dc3f4468 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=32256": { + 
"BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6892c863631b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=32512": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e2f661dda26c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7fa15ee16716 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=33024": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..95868bcb7456 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4413b0d3675b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=3456": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5899404b9634 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=3584": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d89c80fb30b9 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..76c2211485c3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=4096": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b6f398c0b076 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=43264": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..acec64f55cd6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=4608": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7163d4ab8c39 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0925d222a787 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..97ea276d1953 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d8383bc693fe --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=5504": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7922df8c0829 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=5632": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fa5851fd7502 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=6144": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bd05c1814c89 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=6400": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..797f53636838 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3a1619a239c1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=64256": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e953b5c67710 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=6848": { + "BLOCK_K": 64, + 
"SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..423d7b24f01d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=6912": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a7f10bd0d49f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=7168": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..435dadeca1af --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f346f15c2e23 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=64,hidden_size=9216": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0c893f180a10 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1e4afde2e512 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..978cd7ab7325 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=102400": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c7bdb214bd4c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=102656": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6d6f21405c65 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=11008": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..80cf30cfef96 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f8103e769b18 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=128": { + "BLOCK_K": 64, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c3d44a73f35b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=1280": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cae90f69a184 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bba9830ad8b8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3107dffe8924 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a0e14bab1eaf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5373e616c435 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f891ab67fecf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=15360": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b0a9bcbf635d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=2048": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..400122f387f5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=22016": { + 
"BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6b0e09332d14 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=2304": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7c718f80e1b4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..109cc3a80846 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..66199975e837 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=2560": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e8654beb8e5d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=27392": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7f2de73261ab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=2752": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5d07e0d228b6 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=27648": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..08c31b8980d1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=3072": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3b025579accd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..70b96ee453df --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=32256": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..844d7ed28415 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..11201ec67bde --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e2ced87f84be --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=33024": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..00c493a9d303 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=3328": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..463c4846743a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=3456": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c9798336f74f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e6e44d69d91b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=36864": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bd730acc7051 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=4096": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e13c0d95b195 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=43264": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f1a4e393519c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=4608": { + "BLOCK_K": 64, + 
"SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2ef93f449226 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0004a05e8a1e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dc50b17d249d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=5120": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..bf12510a5425 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=5504": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2ac3724620af --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=5632": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e4ab9dc8b8ba --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=6144": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8383761bc837 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=6400": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..097ee2ec6574 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=64000": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6ec204c9c111 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=64256": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..847e461c0323 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e46f913737b3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=6912": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..754a87f1bbe8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=7168": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..83cb91d60ca7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=8192": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7081cf4076bb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=72,hidden_size=9216": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..247e28d35bd5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ea4f2a102c02 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=10240": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8ec9a4f020e6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=102400": { + "BLOCK_K": 256, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d7e734b9dbfe --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=102656": { + "BLOCK_K": 512, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..daf1cb8ae2b0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=11008": { + "BLOCK_K": 128, + "SPLIT_K": 128, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..002abc7662a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1152": { + "BLOCK_K": 32, + 
"SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..92c8b78468f9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=128": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d901fce7e335 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..093e42b6513f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4443e8cbbf33 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fc23807fa76e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ddbdd089d658 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=14336": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e35893b40242 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=1536": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..37acdda7a634 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=15360": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ab9441a7b3ce --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2048": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..59c73ba026de --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=22016": { + "BLOCK_K": 256, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..002f8b5c8968 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2304": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4ae4d6f4734c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4e9bf0aba106 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1d191860b79c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2560": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..23761e39d7ad --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=27392": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cc645e813758 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=2752": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ae6226d1cb21 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=27648": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..88139aaaf02c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3072": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3f05658cfb57 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4fbf55d8bb05 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32256": { + "BLOCK_K": 128, + "SPLIT_K": 32, 
+ "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4064cd359317 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32512": { + "BLOCK_K": 256, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ce19b767a77d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d96efbb58943 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=33024": { + "BLOCK_K": 64, + "SPLIT_K": 16, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8f68f4280c29 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3328": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ec977c3530ae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3456": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7aa5634fcd48 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=3584": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b571892198ce --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=36864": { + "BLOCK_K": 32, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7bb972163a11 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=4096": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ae17c8ecf5e6 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=43264": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d1f32242519d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=4608": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2a5dd4740ffb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=49152": { + "BLOCK_K": 256, + "SPLIT_K": 64, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dbf21f5fa1e0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e920344f2420 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=5120": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..79940e1927b2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=5504": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8a35fdbbafd7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=5632": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c6e6a52180d3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6144": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e9d33ae6f038 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6400": { + "BLOCK_K": 32, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..be5e4ab7d032 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 64, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..711407b0620f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4f73d54aa992 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6848": { + "BLOCK_K": 128, + "SPLIT_K": 32, + 
"num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8c21ae9405a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=6912": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d5e3f555a677 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=7168": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..68faf2a604da --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=8192": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..29f03a383aca --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=8,hidden_size=9216": { + "BLOCK_K": 128, + "SPLIT_K": 32, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3acf2172ddb3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..643a627ea0d4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=10240": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d99bce723687 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=102400": { + "BLOCK_K": 256, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..877a33b65222 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..12ade6916fcb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=11008": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4dc0e71441f8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=1152": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f21a68e8ee83 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=128": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5152f0b3ff4e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3953082729b0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=128000": { + "BLOCK_K": 256, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7524f10fca70 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=128256": { + "BLOCK_K": 256, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2064e9bd9b5b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f5348113634c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=14336": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0dfe95f6c31b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=1536": { + "BLOCK_K": 128, + "SPLIT_K": 8, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6d25ff48d801 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=15360": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6f86b7098d3e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=2048": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ac0d8fe29ee7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=22016": { + 
"BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..eebb376f205b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=2304": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1e45d954518c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..702d10096436 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e89e84d4deed --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=2560": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..78683762005e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=27392": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0da685e67d6d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=2752": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..53017d8ee495 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7c05ddd4194c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=3072": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ebd7f65eac4a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=32000": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3541f0b9a3f0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=32256": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..20d55b196608 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=32512": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..96210fec220f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..125b434ef45b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=33024": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b04fbbdda9de --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=3328": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ce1b418a958c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=3456": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ea1c5a006c8f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..40ec1fab7bba --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=36864": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4c361c17fe59 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=4096": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0dd7c3d37d70 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=43264": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8723dad79e62 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=4608": { + "BLOCK_K": 64, + 
"SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9415037ed7f7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=49152": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..13eb0b2756ae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..dc583685ab02 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=5120": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..23fb68ff8153 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=5504": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b085e5316888 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=5632": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..812297caf2a0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=6144": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..54d92a85b1f6 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=6400": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..405c2e868728 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=64000": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c66a7e5f0ccf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6747ed1a08c7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=6848": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..31651fc1faa2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=6912": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ed2fa7ce6d18 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=7168": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b5546e695dc9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=8192": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7c44a9ae81b9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=80,hidden_size=9216": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f39f6414abd3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=1024": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..841902cc2ff0 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..f7aa2b2768cb --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=102400": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..07e42e00e844 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=102656": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..830764450db5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=11008": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3a5efc527c8b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=1152": { + 
"BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..becb0e603976 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=128": { + "BLOCK_K": 64, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b44b8c3d8180 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=1280": { + "BLOCK_K": 32, + "SPLIT_K": 16, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ddfcfb01c7b7 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6719e38fba98 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..0f52b8c9cc0f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=13824": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9cc14f02017d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=14336": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c50b3242921a --- 
/dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=1536": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7e98f105086a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=15360": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..aa6bee2870fc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=2048": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..21096640cb13 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=22016": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a50aa2e0363c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=2304": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c3364686564e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b7ff93819113 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8d10bf69031c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=2560": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..096b8d320b72 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=27392": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..684d38d2811c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=2752": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2bd01194d5ce --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a7d00956f02c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=3072": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..886d8ea5f4ab --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=32000": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d8e6b13dbe92 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=32256": { + 
"BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c62e742fa961 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e1d739d0a49d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=32768": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..601d128de45a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=33024": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5dd67e2690f9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b7d9fe07cfb5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=3456": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ca7f97699ec8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..005f4af2dd66 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=36864": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b97db9b80ccc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=4096": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fab5f7de4715 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=43264": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..78ca804fa3d1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=4608": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..524c1118598f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=49152": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..387b87a065a8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..34ce46ce03d2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=5120": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..148080894721 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=5504": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c75811679466 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=5632": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fdc0c3cfa0dd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=6144": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c86867594102 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=6400": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8e4ca4b8d8a1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=64000": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..206e6e2d37e4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ab33b78848ec --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=6848": { + "BLOCK_K": 128, 
+ "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e06af8cd8cfd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=6912": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..b775ea143b36 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=7168": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..9300b4bdb8f1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3c604544d052 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=88,hidden_size=9216": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..068b851caa7f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=1024": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..378704ab28cf --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=10240": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..90174392ce3e --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=102400": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..112850a0b030 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=102656": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..77f512f41fb9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=11008": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..1f3f1b604a4d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=1152": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3124fa86ce0a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=128": { + "BLOCK_K": 128, + "SPLIT_K": 1, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fea9bf5bd3a5 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=1280": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..df330fd200a9 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=128000": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3753f062cd04 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=128256": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5aee58b4062e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=13824": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2db120babc1f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=14336": { + "BLOCK_K": 64, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..11a26c11166c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=1536": { + "BLOCK_K": 64, + "SPLIT_K": 32, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..100067e93785 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=15360": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5d9db82e5288 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=2048": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..abc9c95e8b22 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=22016": { + 
"BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..66be2017f0ae --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=2304": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..45873b198e73 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=24576": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..ff32d7268f4e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=256": { + "BLOCK_K": 32, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c404185f47ef --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=2560": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..45dc37cd6c1e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=27392": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..354f9cda513e --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=2752": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3ce20813a940 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=27648": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..fcec90b796f8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=3072": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..06035f8733b3 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=32000": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c5c04329190b --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=32256": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..89bdb176ebc2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=32512": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..faab7a47840a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=32768": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..5b56d69c403f --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=33024": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..4b0e1c5badfa --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=3328": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..600943e1897d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=3456": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6690b75db842 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=3584": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..e584bc28dd7d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=36864": { + "BLOCK_K": 256, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..7b09b5d1a65c --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=4096": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..98d8cebcbf73 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=43264": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c63bbc7f882d --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=4608": { + "BLOCK_K": 128, + 
"SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a36a9b36aa45 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=49152": { + "BLOCK_K": 64, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..d24898f0d4c4 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=512": { + "BLOCK_K": 32, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..56f8409d1aee --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=5120": { + "BLOCK_K": 32, + "SPLIT_K": 8, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..589965340a56 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=5504": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..cf5e15814824 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=5632": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..2034cf5ea634 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=6144": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..a2fb7b122395 --- /dev/null +++ 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=6400": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..3128d88fe9bc --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=64000": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..751604b796cd --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=64256": { + "BLOCK_K": 128, + "SPLIT_K": 2, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..c5d3fc706dc8 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=6848": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 4 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..26748fd2b2b1 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=6912": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..8b0abb8d3cb2 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=7168": { + "BLOCK_K": 128, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..46756b2b589a --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=8192": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 8 + } +} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json new file mode 100644 index 000000000000..6dd33a999d46 --- /dev/null +++ b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json @@ -0,0 +1,7 @@ +{ + "batchs=96,hidden_size=9216": { + "BLOCK_K": 64, + "SPLIT_K": 4, + "num_warps": 4 + } +}
diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py
index fb8110b90564..f4e71cb110bb 100644
--- a/vllm/lora/ops/utils.py
+++ b/vllm/lora/ops/utils.py
@@ -35,7 +35,7 @@ def _get_op_configs(
         f"batchs={batch},hidden_size={hidden_size}", None
     )
     return tuned_config
-
+    # If no optimized configuration is available, return None
     return None
@@ -45,7 +45,6 @@ def _get_default_config(op_type: str, batch: int, hidden_size: int):
         return {"BLOCK_N": 256, "SPLIT_N": 8, "num_warps": 8}
     else:
         return {"BLOCK_K": 32, "SPLIT_K": 64, "num_warps": 8}
-    # raise NotImplementedError
 
 
 def get_lora_op_configs(
From e7bda61b8181ef3dec0f123fa07d7ae92aeef639 Mon Sep 17 00:00:00 2001
From: jeejeeli
Date: Thu, 13 Jun 2024 14:40:26 +0800
Subject: [PATCH 21/71] delete config

---
 ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4096
From e7bda61b8181ef3dec0f123fa07d7ae92aeef639 Mon Sep 17 00:00:00 2001
From: jeejeeli
Date: Thu, 13 Jun 2024 14:40:26 +0800
Subject: [PATCH 21/71] delete config

---
 ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
 ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4096 
device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4096 
device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...en_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4096 
device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...idden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...den_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json | 7 ------- ...dden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json | 7 -------
1150 files changed, 8050 deletions(-)
delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete
mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=256 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3072 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128256 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2048 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3072 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 
device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 
vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json delete mode 100644 vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3b39ab85d9b3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1024": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 926c453330ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=10240": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3403b6a8a156..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ 
-{ - "batchs=1,hidden_size=102400": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 00a40f0fb282..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=102656": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fe2ef151f545..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=11008": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e72812a699b8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1152": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6af79154d137..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=128": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ede75bf5ee4a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1280": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7c18b5d9e89a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=128000": { - "BLOCK_N": 256, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 
100644 index 74123059d34c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=128256": { - "BLOCK_N": 256, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e6204367ba9f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=13824": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 619d49755fbd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=14336": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index da168958d44b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1536": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b774e5e73509..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=15360": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5df0d12a0066..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=2048": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e669eec80db8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=22016": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6f248613276a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=2304": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4b800fceca15..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=24576": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 97ef5bd49850..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=256": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 498985dfa565..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=2560": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 888779c1a242..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=27392": { - "BLOCK_N": 128, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 90c40f66516a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=2752": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 725987ef135d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=27648 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=27648": { - "BLOCK_N": 128, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6758c49d6d53..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3072": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 739073148751..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32000": { - "BLOCK_N": 128, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3ba9089734f5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32256": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 74d73ee28866..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32512": { - "BLOCK_N": 128, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cd011852520e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32768": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bdb74bae1096..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=33024": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 90067aae86ea..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3328": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 266f0b4643a2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3456": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 50eef61c7dc0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3584": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1a0e6bad928a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=36864": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6379489182c5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=4096": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 845e90469c7a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=43264": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6da3f2cdd17e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=4608": { - 
"BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5859f692c3b2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=49152": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a1ed01126386..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=512": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 20e62377ef27..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=5120": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cfd9a3f149ff..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=5504": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7e7ff82dc5f3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=5632": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6312c21225d5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6144": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7bda71f1c3e4..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6400": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c39485cbc08c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=64000": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b1adfbe01c2c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=64256": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0b1aee061aa7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6848": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3a894b412e3f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6912": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0132c4375421..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=7168": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9ffe008aa83e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=8192": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 838189dba35d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=9216": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5b1da4d44b94..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=1024": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c392909217f5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=10240": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2e160c4ae390..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=102400": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 46428cc0a9da..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=102656": { - "BLOCK_N": 512, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 34ff5ebb9fe7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=11008": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1375324c09ad..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=16,hidden_size=1152": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e8b0e9dbe8b0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=128": { - "BLOCK_N": 32, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 72fc4afd1efe..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=1280": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 97e7d9e7bd0d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=128000": { - "BLOCK_N": 1024, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 40a4a9526be0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=128256": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ac35eea6297a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=13824": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 38b1819b0120..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=14336": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json 
deleted file mode 100644 index 668669e9fb4a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=1536": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c23e4b555ab4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=15360": { - "BLOCK_N": 512, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 30715168cdd8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2048": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 050f3384e1cf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=22016": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3820959d0032..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2304": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 08d8f70e1e7a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=24576": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5cf06550f0b1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=256": { - "BLOCK_N": 32, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5ee401212495..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2560": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1d7db0c6a860..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=27392": { - "BLOCK_N": 512, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 77fc2358208c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2752": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3b33817c6ecb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=27648": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d62a622342b5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3072": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c7030ad5a673..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32000": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ff76f3c110b9..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32256": { - "BLOCK_N": 512, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 721b587a948d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32512": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 92932b62f1a1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32768": { - "BLOCK_N": 256, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7cf1394d96bd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=33024": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bec43f2e9cd1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3328": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8dfd12024faa..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3456": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c9fa0757f4d2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3584": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2323a50dfb84..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=36864": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 41e170807720..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=4096": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b04da877902c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=43264": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0df3ef025f97..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=4608": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 07e41596ed86..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=49152": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9013302be01a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=512": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 520a85f2e70a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null 
@@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=5120": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 11a3940a9d4a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=5504": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 43f4baa91a71..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=5632": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2a5260ec1d4d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6144": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 962399539ec2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6400": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cbc8f93ce329..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=64000": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 95e76f479321..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=64256": { - "BLOCK_N": 512, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b967d91645ed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6848": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6c2f971176df..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6912": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e0203c01009e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=7168": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0db797564e0d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=8192": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ab2faa8a3e47..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=9216": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 99d36f108d24..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1024": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5d1797c7df6a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=10240": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3c5a379e0bdc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=102400": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 16f2497bed72..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=102656": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index de9477263adf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=11008": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 58b67d1eb450..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1152": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3b20b74c6b65..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=128": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 711133cabf41..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1280": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8ae36e752fa0..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=128000": { - "BLOCK_N": 1024, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9bd2e640b741..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=128256": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e9ef08e28930..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=13824": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 02df6114edd3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=14336": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f174cccf6781..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1536": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 95398b063d5a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=15360": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a9eb1222067d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=2048": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b0376ebedcf0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=22016": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 33a78ee55501..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=2304": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5d62f3bc77ad..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=24576": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bb04a656ec58..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=256": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 76589c2e4848..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=2560": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c4f0e1fe7d02..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=27392": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4e3e51669b1a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=2,hidden_size=2752": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 76507320d8c9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=27648": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 95aefbac204c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3072": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 804c9a7df946..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32000": { - "BLOCK_N": 512, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8c4069b4c997..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32256": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0de6a013f40e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32512": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fbc4f954e962..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32768": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 
index 89dff508fcff..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=33024": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fb48ebaa8f79..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3328": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7603143e1ff2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3456": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f119d16aa81c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3584": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b19e8f096df7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=36864": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e870cd0967a0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=4096": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b30bd7d45e40..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=43264": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 29ab8038c085..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=4608": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 39faabbdede5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=49152": { - "BLOCK_N": 128, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 87cad481bbef..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=512": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ec7d7b1e6d69..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=5120": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 779014c6a48d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=5504": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e4e1610914d4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=5632": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 91e760ed29e7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=2,hidden_size=6144": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d1dba65beeb5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=6400": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 63cd02986d0e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=64000": { - "BLOCK_N": 128, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 53bd1e2a033d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=64256": { - "BLOCK_N": 256, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ceaea00d1ad4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=6848": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f7fe5d732ec5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=6912": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d026e12311a7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=7168": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 
1ce097bb563d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=8192": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f22a1513a6a4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=9216": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dc055ce5a023..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1024": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fec5aa8a43a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=10240": { - "BLOCK_N": 512, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f76e21dcf101..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=102400": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 02e01a88f229..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=102656": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ca13ab17631b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=11008": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 221b7046a42d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1152": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 04e2fee606e4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=128": { - "BLOCK_N": 64, - "SPLIT_N": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f3e8e4c95080..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1280": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 92d8dc48ef21..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=128000": { - "BLOCK_N": 512, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index deddd170e828..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=128256": { - "BLOCK_N": 256, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d8dba1076582..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=13824": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 585c1d33ce0d..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=14336": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3a2668c5d3fd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1536": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ac9189fdf242..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=15360": { - "BLOCK_N": 64, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9cc54aa3ceae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2048": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a49ce276482b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=22016": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b66638610a4a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2304": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6a6f0780ee68..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=24576": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 64cdc015d4f5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=256": { - "BLOCK_N": 32, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d768b5944d32..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2560": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e1af0bc7d4a2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=27392": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index de80b48b9e46..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2752": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2f1cc53d9d07..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=27648": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5399d322bca2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3072": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ebe63936e73e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null 
@@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32000": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d0af75521d17..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32256": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ecb601665b16..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32512": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0a95531a6226..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32768": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e927a860d646..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=33024": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 64757e015c63..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3328": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index de0faf408e1f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3456": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5c93deb397ab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3584": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3ce9a9150319..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=36864": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 82f6e893c6ad..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=4096": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e21f857efe75..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=43264": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b41625baab6a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=4608": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9ce03febee07..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=49152": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index efecf1f371f9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=512": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} 
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b1a5db564eba..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=5120": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d105bc53b555..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=5504": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e042d730fde0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=5632": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 005e8480d530..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6144": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b0ce387d16a7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6400": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fe8d2347f8be..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=64000": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 19016d92afc8..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=64256": { - "BLOCK_N": 1024, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c24e63d4b061..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6848": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 621e45f3d647..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6912": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1764f42a3690..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=7168": { - "BLOCK_N": 256, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2c005c34c3b5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=8192": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7a560bf5b977..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=9216": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0fb0abbfd93b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1024": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5089debbfefd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=10240": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 879a1d36c817..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=102400": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index eda15de61763..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=102656": { - "BLOCK_N": 64, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 10b2451c5f88..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=11008": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3ef065e97426..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1152": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 26ecc87645eb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=128": { - "BLOCK_N": 32, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cea687cffe03..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ 
/dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1280": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 50b07329f967..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=128000": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c9bc76d4e02d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=128256": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5e2bd08a4728..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=13824": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 95c19b2bafe1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=14336": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2a3648ab54f4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1536": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d384ecbf3556..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=15360": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 017eb531ae10..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2048": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 841202546c13..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=22016": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 78e4c0323585..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2304": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1ac24aae29fe..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=24576": { - "BLOCK_N": 64, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8422904ffb26..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=256": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8a0eb3dd15c3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2560": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0e433e6bed3d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=27392": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 4 - } -} 
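Each file removed in this hunk is a per-shape tuning record for the bgmv Triton kernels: the filename encodes the op type, batch count, hidden size and GPU name, and the JSON body maps the same "batchs=<n>,hidden_size=<h>" key to the launch parameters BLOCK_N, SPLIT_N and num_warps selected for that shape. As a rough illustration of how such a record could be resolved at runtime, a minimal sketch follows; the actual lookup code is not part of this hunk, and the helper name, directory constant and device string below are assumptions.

# Illustrative sketch only -- not code from this patch. Helper name and
# CONFIG_DIR are hypothetical; only the file-name/JSON layout is taken from
# the deleted files shown in the diff.
import json
from pathlib import Path
from typing import Optional

CONFIG_DIR = Path("vllm/lora/ops/bgmv_configs")  # assumed location of the tuned configs


def load_bgmv_config(op_type: str, batchs: int, hidden_size: int,
                     device_name: str) -> Optional[dict]:
    # File names follow the pattern used by the deleted files:
    # "op_type=<t>,batchs=<n>,hidden_size=<h> device_name=<gpu>.json"
    fname = (f"op_type={op_type},batchs={batchs},hidden_size={hidden_size} "
             f"device_name={device_name}.json")
    path = CONFIG_DIR / fname
    if not path.is_file():
        return None  # no tuned entry; caller would fall back to defaults
    with path.open() as f:
        data = json.load(f)
    # The JSON body is keyed by the same shape string, e.g.
    # {"batchs=24,hidden_size=4096": {"BLOCK_N": 128, "SPLIT_N": 64, "num_warps": 4}}
    return data.get(f"batchs={batchs},hidden_size={hidden_size}")


# Example, using values from one of the files deleted above:
# load_bgmv_config("expand", 24, 4096, "NVIDIA_GeForce_RTX_3090")
# -> {"BLOCK_N": 128, "SPLIT_N": 64, "num_warps": 4}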
diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d6adf2ceb6b6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2752": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bca14db36270..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=27648": { - "BLOCK_N": 64, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 985d62438445..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3072": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e8464e7008ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32000": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a47ee1a7459e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32256": { - "BLOCK_N": 1024, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 889bbdc9e7c8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32512": { - "BLOCK_N": 256, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7f722c7cc4f2..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32768": { - "BLOCK_N": 512, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ee0493b17a74..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=33024": { - "BLOCK_N": 32, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 552f181d0e3a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3328": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d2fb833b52b7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3456": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6fd72f2cb7df..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3584": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b566409ba7bc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=36864": { - "BLOCK_N": 512, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 02781cc3c3a3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=4096": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5eb93d119fd2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=43264": { - "BLOCK_N": 64, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 691bf7cf400d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=4608": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e6f417f1ce35..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=49152": { - "BLOCK_N": 64, - "SPLIT_N": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ecc4311c65c6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=512": { - "BLOCK_N": 32, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2f8e3bad9359..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=5120": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 28bb43c5cea0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=5504": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ffd213467c8b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 
+0,0 @@ -{ - "batchs=32,hidden_size=5632": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 43dd6d5c4f34..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6144": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a73329612467..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6400": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1537688252ba..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=64000": { - "BLOCK_N": 512, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bdd7de7b0544..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=64256": { - "BLOCK_N": 512, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 537e8a289957..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6848": { - "BLOCK_N": 128, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5822a67015d7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6912": { - "BLOCK_N": 256, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json 
deleted file mode 100644 index 091e7c378078..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=7168": { - "BLOCK_N": 64, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d0c8d9d533a7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=8192": { - "BLOCK_N": 128, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9b047851381d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=9216": { - "BLOCK_N": 64, - "SPLIT_N": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ba9a5daa8327..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1024": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 49bcf2f569f7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=10240": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6285daf17f71..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=102400": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bd9ce93f682f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=102656": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 939967371660..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=11008": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 798594bfd3a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1152": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3fa0c0edbdfe..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=128": { - "BLOCK_N": 32, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 531e3e4accaa..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1280": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9d597ca3ab45..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=128000": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7826d6de2043..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=128256": { - "BLOCK_N": 128, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4382dfac1232..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=13824": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0dc25b7a9c47..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=14336": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2cb628ac30d9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1536": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d3ade6322fcf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=15360": { - "BLOCK_N": 256, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f743a190ff6c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2048": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 13dc549b58a4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=22016": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dd31e03333ef..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2304": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=24576 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f873a2168d70..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=24576": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f654b1763c6b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=256": { - "BLOCK_N": 32, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c3d6c38da9ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2560": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6ebeee44d74f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=27392": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4583c1bd2a74..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2752": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 457cba2bb27a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=27648": { - "BLOCK_N": 128, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6c60fd9cf325..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=4,hidden_size=3072": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 12dff65ef5e3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32000": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7e6bcdc82b12..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32256": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5cef4c0639e2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32512": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d475b36f7b10..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32768": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 62bd24b55325..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=33024": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b9d49f65f25c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=3328": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 
index f13c7ea9fcdb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=3456": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3c8bb10faf54..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=3584": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index eeeed1d55f4c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=36864": { - "BLOCK_N": 128, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c1780da9065d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=4096": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 73265ea43e99..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=43264": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 95ef324ce999..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=4608": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a2861173e71e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=49152": { - "BLOCK_N": 256, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=512 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 03789328aa67..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=512": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2ecae2ab22ab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=5120": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7979c4049101..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=5504": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e11fdff5cf8b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=5632": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b5d4291484ac..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6144": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 510e5bcdd8f9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6400": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a7d9938f211d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=4,hidden_size=64000": { - "BLOCK_N": 256, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4b4d5715c4a6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=64256": { - "BLOCK_N": 128, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bdc940e4306d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6848": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6ac8e567768d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6912": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a5a8025c74e2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=7168": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4937a00c96b9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=8192": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 14d37e0e84da..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=9216": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 
0655aeaf04d4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1024": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4da43af96a88..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=10240": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 02944f10112d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=102400": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 888cbee83cd0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=102656": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9b48040ae35d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=11008": { - "BLOCK_N": 128, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 17628098a876..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1152": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ba77dfd4e745..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=128": { - "BLOCK_N": 32, - "SPLIT_N": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1280 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index afc038f82824..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1280": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cfca3795cf0c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=128000": { - "BLOCK_N": 512, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 40205831e8c5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=128256": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 73480c2a2fb0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=13824": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0ba68ff88dd7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=14336": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1411944ed903..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1536": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 52d4d6d866da..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ 
-1,7 +0,0 @@ -{ - "batchs=8,hidden_size=15360": { - "BLOCK_N": 1024, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5e8c8b03d807..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2048": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ea2e2b703621..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=22016": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e07a33a9f890..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2304": { - "BLOCK_N": 64, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1cd26dfc178c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=24576": { - "BLOCK_N": 128, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2101c81521e4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=256": { - "BLOCK_N": 32, - "SPLIT_N": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c6545c96c672..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2560": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 
100644 index dee776ee0b6b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=27392": { - "BLOCK_N": 256, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b34f648169cf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2752": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b90111549674..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=27648": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a07aa0c23f3e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3072": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 70c8889ecf4f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32000": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9050b5588db6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32256": { - "BLOCK_N": 128, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 64a0a899f92c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32512": { - "BLOCK_N": 256, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3fc62e336640..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32768": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8bf15baf2d0e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=33024": { - "BLOCK_N": 128, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3b72a219f413..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3328": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 555f04b52080..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3456": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f5e814a48de6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3584": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index aed09238da26..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=36864": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e32fcb1ffb63..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4096 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=4096": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 120a7ac01e61..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=43264": { - "BLOCK_N": 256, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3580fa47a62f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=4608": { - "BLOCK_N": 32, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5b7de710ac80..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=49152": { - "BLOCK_N": 256, - "SPLIT_N": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c25b01a3a2ee..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=512": { - "BLOCK_N": 32, - "SPLIT_N": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 39e20cdd3dc4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=5120": { - "BLOCK_N": 32, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6103c0f82883..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=5504": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5632 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 534348364229..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=5632": { - "BLOCK_N": 64, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 29cdb7e2b43b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6144": { - "BLOCK_N": 128, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e5362a24e683..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6400": { - "BLOCK_N": 32, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6f5a25fef3b2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=64000": { - "BLOCK_N": 256, - "SPLIT_N": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5cd46fba6793..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=64256": { - "BLOCK_N": 64, - "SPLIT_N": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7afa26abf9ed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6848": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c2fbb625f0cb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6912": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 182c287a971c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=7168": { - "BLOCK_N": 64, - "SPLIT_N": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 543cfc5cf252..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=8192": { - "BLOCK_N": 128, - "SPLIT_N": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 209fd07020ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=expand,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=9216": { - "BLOCK_N": 32, - "SPLIT_N": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e7807642e242..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d990a464aead..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=10240": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f8ba6f98e11d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=102400": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2820c3819fb1..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=102656": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8ff38bafd95b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=11008": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2239222c6721..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 76ebd9a6187b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=128": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 604d0f4a24f6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7b9522e223c9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 16ca324abb1d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=13824 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 51fc783ba5be..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=13824": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2b5c9fb008a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=14336": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1d6fdcc9e4cf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0188952eabb5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=15360": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 748317facd1a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=2048": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 762c6902553f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=22016": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 50693dd027e9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 
@@ -{ - "batchs=1,hidden_size=2304": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ce7f797b6501..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1fb7817d045a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 93cb002287b9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=2560": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index df245f7e4e3a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=27392": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c420bc28686a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=2752": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ec8253156d69..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=27648": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 
8eb952a2f70b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3072": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5f4e26b927c4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6567393424e9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32256": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0de485aa91ed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 382626229f94..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=32768": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f339e51540b0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=33024": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1c29335eaf49..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3456 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 164746c525a9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3456": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 32f4a2527a5f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bf9fccfd9628..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=36864": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5afaf9370a75..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=4096": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 434e65d6010f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=43264": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5b05b7d563cb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=4608": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6ed84d41e515..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=1,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 57f7208d5b31..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a39eee775813..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 386dcefdd9ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=5504": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 404b2e3e7143..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=5632": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f0f167c7f637..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6144": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d5caa9380998..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6400": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 
03c28033d268..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=64000": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e5055c7016d7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=64256": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 64efb8751002..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6848": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 090a44e33153..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=6912": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 47231de840f8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=7168": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d0b6943c5df2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=8192": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bcfe34389c8e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=1,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=1,hidden_size=9216": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1024 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9f2c62aa7bf3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f205785ac354..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3b1ae9af607c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=102400": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 21f6c7c4d2c1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 80432fa22304..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=11008": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 431749d79e07..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=1152": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e4cf65f2b466..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=128": { - "BLOCK_K": 64, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 456cdd0cbfa7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 657d4601e38d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3fdc882f8d4d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f16d8e73b04f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cdb9f921d9e2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a4a2728ea3f7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=1536": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e250ad59c685..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=15360": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1fe78e780ab8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=2048": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ea5d35272955..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=22016": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c24d3ebad8f6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=2304": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 60d16b61c097..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1ac220f8cdb4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 593b6236bedc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=104,hidden_size=2560": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e02f41487062..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=27392": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0e21dad2a1bb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=2752": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e7de2732db5d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4594326f1214..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=3072": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 476b94614e61..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=32000": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3394e0b2b2e2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=32256": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32512 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 47b57eaba2d9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=32512": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2095dc93481f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=32768": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d7ae5539e292..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=33024": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 392502ed9bef..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=3328": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index caaf6dd953b3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=3456": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 34fa2b5fc43e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 71e95b2a3456..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=36864": { - "BLOCK_K": 128, - 
"SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1506da8d5b06..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=4096": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 82dd0f4c7d44..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=43264": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f97d81fa1054..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=4608": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 985c8781d3a0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7ca73c314f25..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1a63f1a720a0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=5120": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 
022bfd82a54b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=5504": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9b22d0042659..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=5632": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4075a65a0a7c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=6144": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index eabfa5752d0a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=6400": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4c12481b8079..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=64000": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e30688c4abd2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2e29f14a92cd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 13ff36677840..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=6912": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8539aa72ab90..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=7168": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 53b66c493ea1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=8192": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a6dfe596884f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=104,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=104,hidden_size=9216": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 602ffc66510d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fafea8e3786d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=10240": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1eec6dc77df8..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=102400": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dedb0618406f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=102656": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index de1dcf78c395..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=11008": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 73bc95410c42..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c7834800f885..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=128": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 65868fe3b2f4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6c18618b318f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=128000": { - "BLOCK_K": 256, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128256 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a036606de7cf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 89df4c522fae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=13824": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a142ada1ad07..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=14336": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9bb33d368fef..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3a988a776e6f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=15360": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a57d196ba0ae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2048": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c6aa7bf7b99b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json 
+++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=22016": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 83c7c4e4c9a2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2304": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 427c0106c6ab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=24576": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d47ff5bbb529..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6f1afccc1299..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2560": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 61a1b944f16c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=27392": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6423b56c688d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=2752": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27648 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index aa5170de0248..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ae0f48e917f2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3072": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6897b5908a5b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32000": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4c40c7bfc193..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32256": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8b38bafedc79..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 81d3642fe4ec..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d5263b43cc15..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=33024": { - "BLOCK_K": 256, - "SPLIT_K": 8, - 
"num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 35d23d34e4a7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d72e173b284f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3456": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a6387b5fc49e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=3584": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d80742511305..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b27c32ff573f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=4096": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index af6c84968dea..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=43264": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6da56b5f85f0..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=4608": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index aa2f1263faae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f3b5f24c791b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dde06af0de34..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2dc362e1e1d4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=5504": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 76853f696c19..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=5632": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d498238ea795..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6144": { - "BLOCK_K": 256, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6400 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fc10d7616de9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6400": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 60aed7116c3d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=64000": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 98bccfc678a1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 71bf22fc018b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0cbc42cfc86f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=6912": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ebab6f621840..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=7168": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7f2582d5fabb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null 
@@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=8192": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 37a66a2651be..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=16,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=16,hidden_size=9216": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cb914f7d052d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3cbfe81a8241..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 01318faf6258..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=102400": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7201e38109c1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d36bf9a65f9e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=11008": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json 
deleted file mode 100644 index 01a390c790a7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 068bf8ed0f4e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=128": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7172561ad91a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c39444441044..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0fe8be7dd9e4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=128256": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 463a4228b8b6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1c566ee67de8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=14336": { - "BLOCK_K": 32, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bd612e59861b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c6a18389e22b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=15360": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index de611d52747f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=2048": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7692a7d496a3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=22016": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 67533819f2b7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=2304": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e7df88ce3d48..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=24576": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d60a540f6c9f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=256 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d4f9c110fc67..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=2560": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index db4554d192bc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=27392": { - "BLOCK_K": 32, - "SPLIT_K": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e468805dd6bf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=2752": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ffbb85f3cc9d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 77046010f9f9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3072": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 101a1fde2e17..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32256 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 97ecc56f5a4b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32256": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f1f3fa79859c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32512": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f26ad0121efa..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 08f2719e2a70..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=33024": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 19650dbe5c76..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 43b03a77fccf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3456": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5ea0fd9b7fb1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 97e4bf17acb5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=36864": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b201e52f3f1a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=4096": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d79ac16a16e0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=43264": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9d1bc041bfa0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=4608": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index af3fbc6aa009..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=49152": { - "BLOCK_K": 32, - "SPLIT_K": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3e6ab8e331e8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3824f7dc6657..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5120 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=5120": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d1fd2e96658a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=5504": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d9642bf1e5bf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=5632": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 89e5b64ce96b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=6144": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2345b7e5e882..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=6400": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6de6f1c12486..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b6c138a55f28..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6848 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4a63ed737b7a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ee7b427fbae6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=6912": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 345cb9192105..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=7168": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 91dabeb15527..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b90e9970c8c0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=2,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=2,hidden_size=9216": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d750dbc94ebf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 422be89522eb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=10240": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a43fa1f140e9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=102400": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 89e3dac0a31e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8c75da6742a4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=11008": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 81d9eeae3f90..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 32c16964bada..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=128": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cd66a4abdfdd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6e95d75f0b7a..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6ffa41368045..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=128256": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0c993ab8ef41..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=13824": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4bef43ff2a51..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 382f702b15f5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1b6fb8d0262c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=15360": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9f9ab7d1da6f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2048": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=22016 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2ca6015cc8e2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=22016": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c01c77b849ad..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2304": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1ffc10aab8a9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=24576": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9c90c8e5d3d6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a4a5aff5f9bf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2560": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bfad4944a155..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=27392": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f377b9487847..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ 
-1,7 +0,0 @@ -{ - "batchs=24,hidden_size=2752": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 294a45493519..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5dd926e91002..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3072": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7b41f769bfea..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2f376b9d3d99..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32256": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5a03b6ba2242..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a093a839dd4e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=33024 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 904257e85be9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=33024": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4f41954d2c44..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3328": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7b430bb3ef84..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3456": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5545e8a9bb58..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=3584": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 76e30d5c6721..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=36864": { - "BLOCK_K": 256, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 60adb9b594de..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=4096": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 70fac02be5f4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=43264": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} 
diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fac51807dd14..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=4608": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f8af00a3f0b2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=49152": { - "BLOCK_K": 256, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ec0716115b72..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5bfce900dd59..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 142a66db84ff..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=5504": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1e819ccf88b9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=5632": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 640dfe5c4aab..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6144": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3f9f49c856ae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6400": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ee67fafb3f47..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=64000": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a4c7d1951a2d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=64256": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2668afddbf9d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 432191c74e6e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=6912": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 965794c29584..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=7168": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=8192 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4099c8b31019..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=8192": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 54f120427642..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=24,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=24,hidden_size=9216": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bc0d77a48b76..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4d245dd8606c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=10240": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 15f2726ffb7f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=102400": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 610e795d0889..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 86f0bd139844..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json 
+++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=11008": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 525db2cd9c83..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f154c1ac63ae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=128": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4d9ca99f621b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4ce6fea19d94..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 61b5a2e19b2a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=128256": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ee27b7eb7145..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=13824": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=14336 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f1c0786c18a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2e80cd18b7e3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=1536": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a7ebd2698a89..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=15360": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a70cbb63546d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2048": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 411a6d693f37..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=22016": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7bf4485bdafb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2304": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4b83542ed831..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} 
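[Editor's note, not part of the patch] For orientation while reading these configs: assuming the bgmv shrink kernel splits the hidden (K) dimension across SPLIT_K program instances in tiles of BLOCK_K, each instance loops roughly ceil(hidden_size / (BLOCK_K * SPLIT_K)) times. The snippet below only illustrates that arithmetic for two of the shapes deleted above; it is a reader aid, not code from the patch.

    from math import ceil

    # Per-program K-loop trip count implied by a tuned config (assumed model).
    def k_iters(hidden_size: int, block_k: int, split_k: int) -> int:
        return ceil(hidden_size / (block_k * split_k))

    # E.g. two of the batchs=32 configs removed above:
    assert k_iters(24576, block_k=64, split_k=16) == 24  # hidden_size=24576
    assert k_iters(2048, block_k=64, split_k=8) == 4     # hidden_size=2048
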
diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fcdf01c82b6e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 27fa6e4ab908..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2560": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f32c6f35e55b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=27392": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 97e139922a3b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=2752": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 633e173ede3d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=27648": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4b2853e27e6d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3072": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dbca49ebf47f..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bbbb809b4bac..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32256": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2db20b0f523a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32512": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b76c39dbc79c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=32768": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 95de274c392b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=33024": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 31cc0f0988df..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3328": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d5848f9c0dc2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3456": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3584 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 928c4793a1b4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=3584": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 87333a2977e6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a13f1ae708cc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=4096": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e263c22b48c8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=43264": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 28ccb7922928..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=4608": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 74bdc063f829..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=49152": { - "BLOCK_K": 32, - "SPLIT_K": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e65f07e0c1b7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null 
@@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 86b22822e193..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 068d658420a2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=5504": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bb22b22c0cf4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=5632": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 693e8b466480..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6144": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 72e213c9c841..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6400": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8983a00a5dc0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json 
deleted file mode 100644 index 7ebc1a433047..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=64256": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index baf8710e7904..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6848": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 83d82eab352e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=6912": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ea365d60b5f7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=7168": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e0e266cba149..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=8192": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index df672f000e51..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=32,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=32,hidden_size=9216": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 21b7f37b6f10..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1024": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6780bd1b6061..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c8d5d48f3067..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=102400": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8092de1d055e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=102656": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 25352c1f541a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=11008": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 72a9af0df017..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1152": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8b8a4d4ba0a6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=128": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5a63ccf5ea27..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1280 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1280": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f5322f696374..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=128000": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9c0b7751d2a2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3645b8d9bc6e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f8e1776dbeac..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 67a3fb9e41c7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=1536": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d145bf3a8058..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=15360": { - "BLOCK_K": 256, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 17bbd1c727ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2048": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 50bb33d9ddc2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=22016": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 28de03a38564..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2304": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e8696121d871..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f24f53814e53..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 414e8136350b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2560": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d0a7f1ad50ca..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=27392": { - "BLOCK_K": 
128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 90b14d89c238..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=2752": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dfb4a2036591..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=27648": { - "BLOCK_K": 32, - "SPLIT_K": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index afcc971891fd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=3072": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 86627bb4c2b6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 256, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 18f2e8d01075..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32256": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 60216da8b12c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32512": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3f3d7c814ec2..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=32768": { - "BLOCK_K": 32, - "SPLIT_K": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 11f41c734aed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=33024": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b7b78e643da8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dd301c41083a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=3456": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 00e16062556b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 90b71d6a48aa..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4567fabf9fa3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=4096": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d758bd2bd5d4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=43264": { - "BLOCK_K": 64, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4dd89a2baa91..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=4608": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4e2ea6dc099c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 571e5f11fb3a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 72fbed4df169..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cdebe9fa2d4d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=5504": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b3315ac27857..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=5632": { - "BLOCK_K": 64, 
- "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0f58f063add0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6144": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 36435a35bd1f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6400": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a3d53ddfb20c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=64000": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9ec3fad337d0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 48399c1da49f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 90a1a1dc7123..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=6912": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 490f4fe0958a..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=7168": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7dfbd884de1e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 423a0a8bb660..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=4,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=4,hidden_size=9216": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a23caa15d938..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=1024": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index da975d0f7f04..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 765e06971801..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=102400": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ad02b596b368..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=102656": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=11008 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6ea35e6b4067..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=11008": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3c5b43290284..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d1d0c2952eec..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=128": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7ac3b09650e6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7e364a12309c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 362a4794b89d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6daf2db30092..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ 
/dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d39676778929..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ca1458693a59..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=1536": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5caff317b920..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=15360": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bd333dbdea1b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=2048": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5b17698cf941..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=22016": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8c34a9ffe746..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=2304": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=24576 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 628d5e31f820..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=24576": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 53510a4a3176..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 36e4a63d0806..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=2560": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e95f7c421e3c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=27392": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e9fde38aa4c1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=2752": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 675598e6916c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=27648": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d0c7c62a69f4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=3072": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff 
--git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d38a57c5cb63..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=32000": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fbf651ee7ab1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=32256": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f56745eaf5d9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=32512": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c7e6acf8da4c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=32768": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e214f20d25dd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=33024": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c3a7b1b3d075..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 70c8b1a5cebe..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=3456": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7518d2514d3f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=3584": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8b8caf903283..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 24dfdeb31e27..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=4096": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 73974a4b5ec4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=43264": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 134f097ee092..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=4608": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 99fd65e94e97..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 256, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=512 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a085010d32df..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 50fa49ac53a0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3444417e1cc2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=5504": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 15f14f8a9a1d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=5632": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1c05dce95c3a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=6144": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 56833977b005..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=6400": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bfa3f47fa2eb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 
@@ -{ - "batchs=40,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4991337f39f4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ad9494f2d0ef..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 72ffcbf1889d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=6912": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c5ae6958ebdf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=7168": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bdd5edbfc87b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 610d6175f27e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=40,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=40,hidden_size=9216": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file 
mode 100644 index 2548ac5d500e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 987dc9b61dc2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 848a752c0379..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=102400": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index faa00296ce21..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f280fe287d2b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=11008": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 19ddc1d36abf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a7f53ede4c6d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=128": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d3c33217c882..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a8fb320c7b01..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=128000": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e0a7c154a14f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=128256": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c9b76cf2ea95..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=13824": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1fcf12b7eb96..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c335578c3dab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b9f810c13912..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=15360": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 215df482ca64..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=2048": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 86e68481e14c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=22016": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ee1aa6e358cf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=2304": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f00777cd606a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0382b4bfcad6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0cd5ac59ca34..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=2560": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27392 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3e5825452ae7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=27392": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e9ebb39aaa52..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=2752": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0951d6150b20..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4f6ec8b3fcfc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=3072": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0dc115f3588e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=32000": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 55b7954dd82a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=32256": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 786022725aab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ 
/dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a80125409b55..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c2c745a5b6ab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=33024": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e8803b5163ed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=3328": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d3ee96ee2125..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=3456": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0ba06679888a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index db6774f5f21c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4096 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a7ffc7684eaf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=4096": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6f775a7eaa65..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=43264": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 29275d15516d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=4608": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c1f3159e79ad..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=49152": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 43d2c867eb78..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3ef158578c25..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2fa3f9b2dd5a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=5504": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3b7df5022eed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=5632": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5150a9505d59..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=6144": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 15bd83f652b2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=6400": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index be3441337184..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5adbe1354608..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4d0c8340f94e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=6848": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f25843901a29..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=6912": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e7f8fdd5c289..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=7168": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1c620ed90717..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ca7f79c797da..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=48,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=48,hidden_size=9216": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b18ad9ef740f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7e5e16254b14..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 66b9016b80d6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=102400": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102656 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 41267d76bb7b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8b1bdd081b38..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=11008": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1b86365ec8d8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a3605ec25c3e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=128": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 177e169046c1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index eefe22a77acf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=128000": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6d4866d51277..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ 
/dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d25a6e558ef0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c13f346b0444..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 30fefd4cff6f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d70b173c3ae9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=15360": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9686ba58b423..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=2048": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ded89a74da24..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=22016": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2304 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1bf7575e2a95..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=2304": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ff3b1a65cd88..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=24576": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 32b1dc19f9a0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5a103600df73..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=2560": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 862580187e9b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=27392": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 765b7fcf0597..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=2752": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dad3be5ede3c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=27648": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff 
--git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5e9de4977a0d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=3072": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 12c92e427c5d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cece35827652..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=32256": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 05a07685f4f8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 19b832c59326..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 13256a11de49..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=33024": { - "BLOCK_K": 256, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 71ca4b92a1c6..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7a02f9a5753f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=3456": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 36ce27a20c7c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=3584": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e473965d0d89..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index aa7ad780b5ed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=4096": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d09de854e154..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=43264": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c600f9a4a564..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=4608": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f2fcd1b21602..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a0f8e37807d3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9fbe3a6da66c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=5120": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 60d5c4dc40be..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=5504": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a36eb605fa4c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=5632": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ad7b11ec6f27..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=6144": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 528a7abd6ffd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 
@@ -{ - "batchs=56,hidden_size=6400": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5e1779ffba43..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c89b534e12a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=64256": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7943d0cfde5e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 06705f2aa342..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=6912": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 21fcc4c55de4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=7168": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c7b2c16677ae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file 
mode 100644 index 03d554a6d65e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=56,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=56,hidden_size=9216": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d0938a7af883..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 07bcdbf20094..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a3b3e1a40bd6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=102400": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 64d9dace43e6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=102656": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5dfc981bc2f6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=11008": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bbefa2f6fdc6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=1152": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c6f10cf4989f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=128": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4663289ea195..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=1280": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4f50abb8852a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a3cc61173170..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6a6d73ea7f2a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=13824": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7ed41fea026e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 230edf27c64b..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=1536": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cdd186cc4b9f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=15360": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4f76011c145a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=2048": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7798c41d0f06..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=22016": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 49d5d5dd3ac6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=2304": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9f700c6130b4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=24576": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4cc76fa38c81..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 76ee8a7d6102..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=2560": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 71b40355f9d8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=27392": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7cdad6a26049..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=2752": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a17c9772d2bb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=27648": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a6d8b40ac252..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=3072": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bc0b651ecd4c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=32000": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c9c7dc3f4468..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ 
/dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=32256": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6892c863631b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=32512": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e2f661dda26c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7fa15ee16716..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=33024": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 95868bcb7456..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4413b0d3675b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=3456": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5899404b9634..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=3584": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d89c80fb30b9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 76c2211485c3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=4096": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b6f398c0b076..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=43264": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index acec64f55cd6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=4608": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7163d4ab8c39..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0925d222a787..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 97ea276d1953..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff 
--git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d8383bc693fe..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=5504": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7922df8c0829..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=5632": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fa5851fd7502..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=6144": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bd05c1814c89..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=6400": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 797f53636838..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3a1619a239c1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=64256": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e953b5c67710..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 423d7b24f01d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=6912": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a7f10bd0d49f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=7168": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 435dadeca1af..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f346f15c2e23..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=64,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=64,hidden_size=9216": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0c893f180a10..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1e4afde2e512..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 978cd7ab7325..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=102400": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c7bdb214bd4c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=102656": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6d6f21405c65..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=11008": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 80cf30cfef96..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f8103e769b18..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=128": { - "BLOCK_K": 64, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c3d44a73f35b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=1280": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cae90f69a184..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ 
/dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bba9830ad8b8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3107dffe8924..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a0e14bab1eaf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5373e616c435..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f891ab67fecf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=15360": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b0a9bcbf635d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=2048": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 400122f387f5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=22016": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6b0e09332d14..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=2304": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7c718f80e1b4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 109cc3a80846..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 66199975e837..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=2560": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e8654beb8e5d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=27392": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7f2de73261ab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=2752": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff 
--git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5d07e0d228b6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=27648": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 08c31b8980d1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=3072": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3b025579accd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 70b96ee453df..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=32256": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 844d7ed28415..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 11201ec67bde..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e2ced87f84be..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=33024": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 00c493a9d303..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=3328": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 463c4846743a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=3456": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c9798336f74f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e6e44d69d91b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=36864": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bd730acc7051..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=4096": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e13c0d95b195..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=43264": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f1a4e393519c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=4608": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2ef93f449226..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0004a05e8a1e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dc50b17d249d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=5120": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index bf12510a5425..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=5504": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2ac3724620af..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=5632": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e4ab9dc8b8ba..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 
@@ -{ - "batchs=72,hidden_size=6144": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8383761bc837..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=6400": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 097ee2ec6574..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=64000": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6ec204c9c111..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=64256": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 847e461c0323..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e46f913737b3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=6912": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 754a87f1bbe8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=7168": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file 
mode 100644 index 83cb91d60ca7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=8192": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7081cf4076bb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=72,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=72,hidden_size=9216": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 247e28d35bd5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ea4f2a102c02..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=10240": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8ec9a4f020e6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=102400": { - "BLOCK_K": 256, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d7e734b9dbfe..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=102656": { - "BLOCK_K": 512, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index daf1cb8ae2b0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=11008": { - "BLOCK_K": 128, - "SPLIT_K": 128, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 002abc7662a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 92c8b78468f9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=128": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d901fce7e335..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 093e42b6513f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4443e8cbbf33..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fc23807fa76e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ddbdd089d658..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=14336 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=14336": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e35893b40242..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=1536": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 37acdda7a634..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=15360": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ab9441a7b3ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2048": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 59c73ba026de..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=22016": { - "BLOCK_K": 256, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 002f8b5c8968..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2304": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4ae4d6f4734c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4e9bf0aba106..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1d191860b79c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2560": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 23761e39d7ad..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=27392": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cc645e813758..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=2752": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ae6226d1cb21..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=27648": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 88139aaaf02c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3072": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3f05658cfb57..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4fbf55d8bb05..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32256": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4064cd359317..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32512": { - "BLOCK_K": 256, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ce19b767a77d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d96efbb58943..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=33024": { - "BLOCK_K": 64, - "SPLIT_K": 16, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8f68f4280c29..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3328": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ec977c3530ae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3456": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7aa5634fcd48..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=3584 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=3584": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b571892198ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=36864": { - "BLOCK_K": 32, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7bb972163a11..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=4096": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ae17c8ecf5e6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=43264": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d1f32242519d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=4608": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2a5dd4740ffb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=49152": { - "BLOCK_K": 256, - "SPLIT_K": 64, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dbf21f5fa1e0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e920344f2420..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=5120": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 79940e1927b2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=5504": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8a35fdbbafd7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=5632": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c6e6a52180d3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6144": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e9d33ae6f038..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6400": { - "BLOCK_K": 32, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index be5e4ab7d032..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 64, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 711407b0620f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4f73d54aa992..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6848": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8c21ae9405a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=6912": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d5e3f555a677..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=7168": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 68faf2a604da..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=8192": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 29f03a383aca..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=8,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=8,hidden_size=9216": { - "BLOCK_K": 128, - "SPLIT_K": 32, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3acf2172ddb3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 643a627ea0d4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=10240 
device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=10240": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d99bce723687..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=102400": { - "BLOCK_K": 256, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 877a33b65222..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 12ade6916fcb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=11008": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4dc0e71441f8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f21a68e8ee83..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=128": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5152f0b3ff4e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json 
b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3953082729b0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=128000": { - "BLOCK_K": 256, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7524f10fca70..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=128256": { - "BLOCK_K": 256, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2064e9bd9b5b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f5348113634c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=14336": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0dfe95f6c31b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=1536": { - "BLOCK_K": 128, - "SPLIT_K": 8, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6d25ff48d801..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=15360": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6f86b7098d3e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - 
"batchs=80,hidden_size=2048": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ac0d8fe29ee7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=22016": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index eebb376f205b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=2304": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1e45d954518c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 702d10096436..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e89e84d4deed..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=2560": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 78683762005e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=27392": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 
100644 index 0da685e67d6d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=2752": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 53017d8ee495..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7c05ddd4194c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=3072": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ebd7f65eac4a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=32000": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3541f0b9a3f0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=32256": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 20d55b196608..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=32512": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 96210fec220f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 125b434ef45b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=33024": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b04fbbdda9de..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=3328": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ce1b418a958c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=3456": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ea1c5a006c8f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 40ec1fab7bba..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=36864": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4c361c17fe59..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=4096": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0dd7c3d37d70..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=43264": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8723dad79e62..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=4608": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9415037ed7f7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=49152": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 13eb0b2756ae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index dc583685ab02..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=5120": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 23fb68ff8153..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=5504": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b085e5316888..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=5632": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 812297caf2a0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=6144": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 54d92a85b1f6..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=6400": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 405c2e868728..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=64000": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c66a7e5f0ccf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6747ed1a08c7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=6848": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 31651fc1faa2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=6912": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ed2fa7ce6d18..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ 
-1,7 +0,0 @@ -{ - "batchs=80,hidden_size=7168": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b5546e695dc9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=8192": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7c44a9ae81b9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=80,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=80,hidden_size=9216": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f39f6414abd3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=1024": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 841902cc2ff0..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index f7aa2b2768cb..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=102400": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 07e42e00e844..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=102656": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 830764450db5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=11008": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3a5efc527c8b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=1152": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index becb0e603976..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=128": { - "BLOCK_K": 64, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b44b8c3d8180..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=1280": { - "BLOCK_K": 32, - "SPLIT_K": 16, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ddfcfb01c7b7..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6719e38fba98..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 0f52b8c9cc0f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=13824": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - 
} -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9cc14f02017d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=14336": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c50b3242921a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=1536": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7e98f105086a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=15360": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index aa6bee2870fc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=2048": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 21096640cb13..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=22016": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a50aa2e0363c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=2304": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c3364686564e..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b7ff93819113..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8d10bf69031c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=2560": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 096b8d320b72..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=27392": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 684d38d2811c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=2752": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2bd01194d5ce..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a7d00956f02c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=3072": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 886d8ea5f4ab..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=32000": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d8e6b13dbe92..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=32256": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c62e742fa961..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e1d739d0a49d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=32768": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 601d128de45a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=33024": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5dd67e2690f9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b7d9fe07cfb5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ 
/dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=3456": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ca7f97699ec8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 005f4af2dd66..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=36864": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b97db9b80ccc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=4096": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fab5f7de4715..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=43264": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 78ca804fa3d1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=4608": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 524c1118598f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=49152": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 387b87a065a8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 34ce46ce03d2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=5120": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 148080894721..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=5504": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c75811679466..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=5632": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fdc0c3cfa0dd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=6144": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c86867594102..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=6400": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8e4ca4b8d8a1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=64000": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 206e6e2d37e4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ab33b78848ec..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=6848": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e06af8cd8cfd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=6912": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index b775ea143b36..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=7168": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 9300b4bdb8f1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3c604544d052..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=88,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=88,hidden_size=9216": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 068b851caa7f..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=1024": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 378704ab28cf..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=10240 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=10240": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 90174392ce3e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=102400": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 112850a0b030..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=102656 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=102656": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 77f512f41fb9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=11008 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=11008": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 1f3f1b604a4d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=1152": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3124fa86ce0a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=128": { - "BLOCK_K": 128, - "SPLIT_K": 1, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fea9bf5bd3a5..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1280 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=1280": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index df330fd200a9..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=128000": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3753f062cd04..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=128256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=128256": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5aee58b4062e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=13824 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=13824": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2db120babc1f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=14336 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=14336": { - "BLOCK_K": 64, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 11a26c11166c..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=1536 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=1536": { - "BLOCK_K": 64, - "SPLIT_K": 32, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 100067e93785..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=15360 device_name=NVIDIA_GeForce_RTX_3090.json 
+++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=15360": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5d9db82e5288..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2048 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=2048": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index abc9c95e8b22..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=22016 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=22016": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 66be2017f0ae..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2304 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=2304": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 45873b198e73..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=24576 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=24576": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index ff32d7268f4e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=256": { - "BLOCK_K": 32, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c404185f47ef..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2560 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=2560": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 
device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 45dc37cd6c1e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27392 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=27392": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 354f9cda513e..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=2752 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=2752": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3ce20813a940..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=27648 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=27648": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index fcec90b796f8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3072 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=3072": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 06035f8733b3..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=32000": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c5c04329190b..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=32256": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 89bdb176ebc2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=32512": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } 
-} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index faab7a47840a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=32768 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=32768": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 5b56d69c403f..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=33024 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=33024": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 4b0e1c5badfa..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3328 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=3328": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 600943e1897d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3456 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=3456": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6690b75db842..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=3584 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=3584": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index e584bc28dd7d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=36864 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=36864": { - "BLOCK_K": 256, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 7b09b5d1a65c..000000000000 --- 
a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4096 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=4096": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 98d8cebcbf73..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=43264 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=43264": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c63bbc7f882d..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=4608 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=4608": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a36a9b36aa45..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=49152 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=49152": { - "BLOCK_K": 64, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index d24898f0d4c4..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=512 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=512": { - "BLOCK_K": 32, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 56f8409d1aee..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5120 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=5120": { - "BLOCK_K": 32, - "SPLIT_K": 8, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 589965340a56..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5504 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=5504": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 
device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index cf5e15814824..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=5632 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=5632": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 2034cf5ea634..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6144 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=6144": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index a2fb7b122395..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6400 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=6400": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 3128d88fe9bc..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64000 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=64000": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 751604b796cd..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=64256 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=64256": { - "BLOCK_K": 128, - "SPLIT_K": 2, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index c5d3fc706dc8..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6848 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=6848": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 4 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 26748fd2b2b1..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=6912 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null 
@@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=6912": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 8b0abb8d3cb2..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=7168 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=7168": { - "BLOCK_K": 128, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 46756b2b589a..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=8192 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=8192": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 8 - } -} diff --git a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json b/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json deleted file mode 100644 index 6dd33a999d46..000000000000 --- a/vllm/lora/ops/bgmv_configs/op_type=shrink,batchs=96,hidden_size=9216 device_name=NVIDIA_GeForce_RTX_3090.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "batchs=96,hidden_size=9216": { - "BLOCK_K": 64, - "SPLIT_K": 4, - "num_warps": 4 - } -} From b345434581e042a296168a0efb4cca66631af1b9 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 14 Jun 2024 14:18:19 +0800 Subject: [PATCH 22/71] add default config --- tests/lora/test_triton_punica.py | 3 +++ vllm/lora/ops/bgmv_expand.py | 5 +++-- vllm/lora/ops/bgmv_expand_slice.py | 4 +++- vllm/lora/ops/bgmv_shrink.py | 4 +++- vllm/lora/ops/utils.py | 26 ++++++++++---------------- 5 files changed, 22 insertions(+), 20 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index a098aba16456..29df528cdf05 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -325,6 +325,7 @@ def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, # assert_close(our_out_tensor, ref_out_tensor) +@pytest.mark.skip("stop") @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) @@ -469,6 +470,7 @@ def test_triton_bgmv_punica_bgmv( assert_close(our_out_tensor, ref_out_tensor) +@pytest.mark.skip("stop") @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", NSLICES) @@ -547,6 +549,7 @@ def test_sgmv_expand_slice( assert_close(our_outputs, ref_outputs) +@pytest.mark.skip("stop") @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", NSLICES) diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 04fdd670243d..b977540cbfb4 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -5,10 +5,12 @@ https://arxiv.org/abs/2310.18547 """ +from typing import Dict, Optional + import torch import triton import triton.language as tl -from typing import Dict, Optional + from 
.utils import get_lora_op_configs @@ -137,7 +139,6 @@ def bgmv_expand( torch.bfloat16, ]: CAST_TYPE = True - config = {"BLOCK_N": 64, "SPLIT_N": 8} batchs = lora_indices_tensor.size(0) if override_config: diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index becaf4f1ca07..c741d10e9c9d 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -5,10 +5,12 @@ https://arxiv.org/abs/2310.18547 """ +from typing import Dict, Optional + import torch import triton import triton.language as tl -from typing import Any, Dict, Optional + from .utils import get_lora_op_configs diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index 99b9d7ee5b9f..a7087a96488f 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -5,10 +5,12 @@ https://arxiv.org/abs/2310.18547 """ +from typing import Dict, Optional + import torch import triton import triton.language as tl -from typing import Dict, Optional + from .utils import get_lora_op_configs diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py index f4e71cb110bb..6124916cfd9d 100644 --- a/vllm/lora/ops/utils.py +++ b/vllm/lora/ops/utils.py @@ -1,7 +1,7 @@ import functools import json import os -from typing import Dict, Optional +from typing import Dict def _get_config_file_name( @@ -11,16 +11,12 @@ def _get_config_file_name( ) -> str: # device_name = torch.cuda.get_device_name().replace(" ", "_") device_name = "NVIDIA_GeForce_RTX_3090" - return ( - f"op_type={op_type},batchs={batchs},hidden_size={hidden_size} " - + f"device_name={device_name}.json" - ) + return (f"op_type={op_type},batchs={batchs},hidden_size={hidden_size} " + + f"device_name={device_name}.json") @functools.lru_cache -def _get_op_configs( - op_type: str, batch: int, hidden_size: int -) -> Optional[Dict[str, int]]: +def _get_op_configs(op_type: str, batch: int, hidden_size: int): FOLDER_NAME = "bgmv_configs" json_file_name = _get_config_file_name(op_type, batch, hidden_size) @@ -32,24 +28,22 @@ def _get_op_configs( if os.path.exists(config_file_path): with open(config_file_path) as f: tuned_config = json.load(f).get( - f"batchs={batch},hidden_size={hidden_size}", None - ) + f"batchs={batch},hidden_size={hidden_size}", None) return tuned_config - + # If no optimized configuration is available, return None return None def _get_default_config(op_type: str, batch: int, hidden_size: int): if op_type == "expand": - return {"BLOCK_N": 256, "SPLIT_N": 8, "num_warps": 8} + return {"BLOCK_N": 256, "SPLIT_N": 64, "num_warps": 8} else: - return {"BLOCK_K": 32, "SPLIT_K": 64, "num_warps": 8} + return {"BLOCK_K": 256, "SPLIT_K": 64, "num_warps": 8} -def get_lora_op_configs( - op_type: str, batch: int, hidden_size: int -) -> Dict[str, int]: +def get_lora_op_configs(op_type: str, batch: int, + hidden_size: int) -> Dict[str, int]: config = _get_op_configs(op_type, batch, hidden_size) if not config: config = _get_default_config(op_type, batch, hidden_size) From 00e007695c8cfa466f53fa74a0a601aa42a10cd7 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 14 Jun 2024 14:20:54 +0800 Subject: [PATCH 23/71] add default config --- tests/lora/test_triton_punica.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 29df528cdf05..1a5fd9e3f4d7 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -325,7 +325,6 @@ def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, # 
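For orientation, this is how the tuned-config lookup in vllm/lora/ops/utils.py is meant to be consumed; a minimal sketch, where the default values come from _get_default_config in the hunk above and the kernel-launch call site is an assumption, not code from this patch:

    from vllm.lora.ops.utils import get_lora_op_configs

    # A per-shape JSON file (like the ones removed above) takes precedence when
    # present; otherwise the hard-coded defaults are returned.
    shrink_cfg = get_lora_op_configs("shrink", batch=96, hidden_size=4096)
    # no matching JSON -> {"BLOCK_K": 256, "SPLIT_K": 64, "num_warps": 8}
    expand_cfg = get_lora_op_configs("expand", batch=96, hidden_size=4096)
    # no matching JSON -> {"BLOCK_N": 256, "SPLIT_N": 64, "num_warps": 8}

    # The returned dict is then splatted into the Triton launch, e.g.
    # _bgmv_shrink_kernel[grid](..., **shrink_cfg)   (call site assumed)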
assert_close(our_out_tensor, ref_out_tensor) -@pytest.mark.skip("stop") @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) @@ -469,7 +468,6 @@ def test_triton_bgmv_punica_bgmv( ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) - @pytest.mark.skip("stop") @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @@ -549,7 +547,6 @@ def test_sgmv_expand_slice( assert_close(our_outputs, ref_outputs) -@pytest.mark.skip("stop") @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", NSLICES) From f4bd5804a0cc89c773d62de98ca05d8e0f3a7707 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 14 Jun 2024 15:20:16 +0800 Subject: [PATCH 24/71] test conflict --- vllm/worker/model_runner.py | 325 ++++++++++++++++++++---------------- 1 file changed, 177 insertions(+), 148 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a3e52a749fb6..476e9ba3bb46 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,5 +1,7 @@ +import gc import time import warnings +from collections import defaultdict from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union import numpy as np @@ -11,16 +13,17 @@ ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict -from vllm.distributed.communication_op import graph_capture +from vllm.distributed.parallel_state import graph_capture from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sampling_params import SamplingParams -from vllm.sequence import (MultiModalData, SamplerOutput, SequenceData, - SequenceGroupMetadata) +from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip, is_pin_memory_available, make_tensor_with_pad) @@ -34,6 +37,7 @@ _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33) ] +_NUM_WARMUP_ITERS = 2 class ModelInput(NamedTuple): @@ -44,7 +48,7 @@ class ModelInput(NamedTuple): query_lens: List[int] lora_mapping: Optional[LoRAMapping] lora_requests: Set[LoRARequest] - multi_modal_input: Optional[torch.Tensor] + multi_modal_kwargs: Dict[str, torch.Tensor] slot_mapping: torch.Tensor num_prefill_tokens: int num_decode_tokens: int @@ -60,7 +64,7 @@ def empty(cls, device): query_lens=[], lora_mapping=None, lora_requests=set(), - multi_modal_input=None, + multi_modal_kwargs={}, slot_mapping=torch.empty(0, device=device), num_prefill_tokens=0, num_decode_tokens=0, @@ -122,6 +126,16 @@ def __init__( self.block_size, ) + # Create processor for multi-modal data + if self.vision_language_config is not None: + self.multi_modal_input_processor = MULTIMODAL_REGISTRY \ + .create_input_processor( + self.model_config, + self.vision_language_config, + ) + else: + self.multi_modal_input_processor = None + # Lazy initialization self.model: nn.Module # Set after load_model # Set if the backend is flashinfer. 
@@ -209,6 +223,16 @@ def save_sharded_state( max_size=max_size, ) + def save_tensorized_model( + self, + tensorizer_config: TensorizerConfig, + ) -> None: + from vllm.model_executor.model_loader.loader import TensorizerLoader + TensorizerLoader.save_model( + self.model, + tensorizer_config=tensorizer_config, + ) + def get_max_block_per_batch(self) -> int: block_size = self.block_size return (self.max_seq_len_to_capture + block_size - 1) // block_size @@ -233,7 +257,6 @@ def _prepare_model_input( input_positions: List[int] = [] slot_mapping: List[int] = [] lora_index_mapping: List[int] = [] - batch_lora_index_mapping: List[int] = [] lora_prompt_mapping: List[int] = [] lora_requests: Set[LoRARequest] = set() @@ -243,7 +266,8 @@ def _prepare_model_input( context_lens: List[int] = [] query_lens: List[int] = [] block_tables: List[List[int]] = [] - multi_modal_input_list: List[torch.Tensor] = [] + multi_modal_kwargs_list: Dict[str, + List[torch.Tensor]] = defaultdict(list) decode_only = True num_prefills = 0 num_prefill_tokens = 0 @@ -270,6 +294,12 @@ def _prepare_model_input( if len(seq_group_metadata_list) == 0: return ModelInput.empty(self.device) + if self.sliding_window is not None: + sliding_window_blocks = (self.sliding_window + self.block_size - + 1) // self.block_size + block_aligned_sliding_window = \ + sliding_window_blocks * self.block_size + for seq_group_metadata in seq_group_metadata_list: seq_ids = list(seq_group_metadata.seq_data.keys()) is_prompt = seq_group_metadata.is_prompt @@ -310,6 +340,30 @@ def _prepare_model_input( and self.sliding_window is None and is_prompt) + # These are seq_len/context_len capped to the sliding window. + # They are passed to decode kernel. + # We still need original seq_len/context_len to compute slot + # mapping (and input position) below. + curr_sliding_window_blocks = None + sliding_seq_len = seq_len + sliding_context_len = context_len + + # TODO(sang): This is a hack to make sliding window work with + # paged attn. We can remove it if we make paged attn kernel + # to properly handle slinding window attn. + if (self.sliding_window is not None and not is_prompt): + curr_sliding_window_blocks = sliding_window_blocks + if self.scheduler_config.use_v2_block_manager: + # number of elements in last block + suff_len = seq_len % self.block_size + sliding_seq_len = min( + seq_len, block_aligned_sliding_window + suff_len) + if suff_len > 0: + curr_sliding_window_blocks += 1 + else: + sliding_seq_len = min(seq_len, self.sliding_window) + sliding_context_len = sliding_seq_len - 1 + # TODO(sang): Combine chunked prefill and prefix caching by # only allowing multiple of block_size chunk size. # NOTE: This only works for oooooooxxx style attention. @@ -317,6 +371,13 @@ def _prepare_model_input( assert computed_block_nums is not None context_len = len(computed_block_nums) * self.block_size tokens = tokens[context_len:] + + # need to think what to set it to when we have both sliding + # window and prefix caching... + assert self.sliding_window is None, \ + "Prefix caching is not supported with sliding window" + sliding_context_len = context_len + if self.attn_backend.get_name() == "flash-attn": # NOTE(woosuk): For flash-attn, the block table should # include the entries for the incoming prefill tokens. 
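A small worked example of the block-aligned sliding-window arithmetic introduced in the hunk above; the concrete numbers are chosen only for illustration:

    # Assume block_size=16, sliding_window=100, and a decode step at seq_len=215.
    block_size, sliding_window, seq_len = 16, 100, 215

    sliding_window_blocks = (sliding_window + block_size - 1) // block_size   # 7
    block_aligned_sliding_window = sliding_window_blocks * block_size         # 112

    # The v2 block manager keeps the partially filled last block in full:
    suff_len = seq_len % block_size                                           # 7
    sliding_seq_len = min(seq_len, block_aligned_sliding_window + suff_len)   # 119
    curr_sliding_window_blocks = sliding_window_blocks + (1 if suff_len else 0)  # 8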
@@ -330,14 +391,9 @@ def _prepare_model_input( if seq_group_metadata.block_tables is not None: # chunked prefill or decode block_table = seq_group_metadata.block_tables[seq_id] - if self.sliding_window is not None: - # chunked prefill doesn't support sliding window. - assert (not self.scheduler_config. - chunked_prefill_enabled) - sliding_window_blocks = (self.sliding_window // - self.block_size) - block_table = block_table[-sliding_window_blocks:] - + if curr_sliding_window_blocks is not None: + block_table = block_table[ + -curr_sliding_window_blocks:] if self.attn_backend.get_name() == "flashinfer": paged_kv_indices.extend(block_table) paged_kv_indptr.append(paged_kv_indptr[-1] + @@ -355,16 +411,9 @@ def _prepare_model_input( block_table = [] block_tables.append(block_table) - # TODO(sang): This is a hack to make sliding window work with - # paged attn. We can remove it if we make paged attn kernel - # to properly handle slinding window attn. - if (self.sliding_window is not None and not is_prompt): - seq_len = min(seq_len, self.sliding_window) - context_len = seq_len - 1 - - seq_lens.append(seq_len) - context_lens.append(context_len) - query_len = seq_len - context_len + seq_lens.append(sliding_seq_len) + context_lens.append(sliding_context_len) + query_len = sliding_seq_len - sliding_context_len query_lens.append(query_len) input_tokens.extend(tokens) input_positions.extend(list(range(context_len, seq_len))) @@ -381,23 +430,29 @@ def _prepare_model_input( "seq_len: {}, context_len: {}, query_len: {}".format( seq_len, context_len, query_len)) num_decode_tokens += query_len - decode_seq_lens.append(seq_len) + decode_seq_lens.append(sliding_seq_len) if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) lora_index_mapping += [lora_id] * query_len - batch_lora_index_mapping += [lora_id if lora_id > 0 else -1] lora_prompt_mapping.extend( [lora_id] * - (seq_len - - context_len if seq_group_metadata.sampling_params + (query_len if seq_group_metadata.sampling_params and seq_group_metadata.sampling_params.prompt_logprobs - else 1)) + is not None else 1)) + + mm_data = seq_group_metadata.multi_modal_data + if mm_data is not None: + # Process multi-modal data + if self.multi_modal_input_processor is None: + raise ValueError( + "Multi-modal inputs are only supported by " + "vision language models.") - if seq_group_metadata.multi_modal_data: - multi_modal_input_list.append( - seq_group_metadata.multi_modal_data.data) + mm_kwargs = self.multi_modal_input_processor(mm_data) + for k, v in mm_kwargs.items(): + multi_modal_kwargs_list[k].append(v) if _is_block_tables_empty(seq_group_metadata.block_tables): # During memory profiling, the block tables are not @@ -419,9 +474,10 @@ def _prepare_model_input( start_idx = 0 if self.sliding_window is not None: if is_prompt: - assert context_len == 0, ( + assert self.scheduler_config.use_v2_block_manager \ + or context_len == 0, ( "Prefix caching is currently not supported with " - "sliding window attention") + "sliding window attention in V1 block manager") # It is an optimization. When it is decoding, it is always # 0. When prefill, we use it to not write slots to kv cache # to save memory. 
@@ -482,29 +538,6 @@ def _prepare_model_input( ) assert max_query_len > 0, ("query_lens: {}".format(query_lens)) - context_lens_tensor = torch.tensor(context_lens, - dtype=torch.int, - device=self.device) - - if multi_modal_input_list: - assert self.vision_language_config, ( - "Multi-modal inputs are only supported by " - "vision language models.") - multi_modal_input = torch.cat(multi_modal_input_list, - dim=0).to(self.device) - else: - multi_modal_input = None - - seq_lens_tensor = torch.tensor(seq_lens, - dtype=torch.int, - device=self.device) - query_lens_tensor = torch.tensor(query_lens, - dtype=torch.long, - device=self.device) - query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=self.device) - seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int, device=self.device) @@ -512,11 +545,6 @@ def _prepare_model_input( dtype=torch.int32, device=self.device) - torch.cumsum(query_lens_tensor, - dim=0, - dtype=query_start_loc.dtype, - out=query_start_loc[1:]) - torch.cumsum(seq_lens_tensor, dim=0, dtype=seq_start_loc.dtype, @@ -569,6 +597,21 @@ def _prepare_model_input( seq_start_loc=seq_start_loc, data_type=kv_cache_dtype) else: + context_lens_tensor = torch.tensor(context_lens, + dtype=torch.int, + device=self.device) + query_lens_tensor = torch.tensor(query_lens, + dtype=torch.long, + device=self.device) + query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=self.device) + + torch.cumsum(query_lens_tensor, + dim=0, + dtype=query_start_loc.dtype, + out=query_start_loc[1:]) + attn_metadata = self.attn_backend.make_metadata( num_prefills=num_prefills, slot_mapping=slot_mapping_tensor, @@ -587,12 +630,18 @@ def _prepare_model_input( ) if self.lora_config: - lora_mapping = LoRAMapping(lora_index_mapping, lora_prompt_mapping, - batch_lora_index_mapping, query_lens, - bool(attn_metadata.prefill_metadata)) + lora_mapping = LoRAMapping( + lora_index_mapping, + lora_prompt_mapping, + ) else: lora_mapping = None + multi_modal_kwargs = { + k: torch.cat(v, dim=0).to(self.device) + for k, v in multi_modal_kwargs_list.items() + } + return ModelInput( input_tokens=input_tokens_tensor, input_positions=input_positions_tensor, @@ -601,7 +650,7 @@ def _prepare_model_input( query_lens=query_lens, lora_mapping=lora_mapping, lora_requests=lora_requests, - multi_modal_input=multi_modal_input, + multi_modal_kwargs=multi_modal_kwargs, slot_mapping=slot_mapping_tensor, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, @@ -612,7 +661,7 @@ def prepare_input_tensors( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, - Set[LoRARequest], LoRAMapping, torch.Tensor]: + Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]: if self.is_driver_worker: assert seq_group_metadata_list is not None # Prepare input tensors. 
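As a reading aid for the hunk above: LoRAMapping now carries only the per-token and per-prompt LoRA ids, and multi-modal inputs travel as a plain kwargs dict. A small sketch with invented sequence lengths and LoRA ids:

    from vllm.lora.layers import LoRAMapping

    # Two prefill sequences of lengths 3 and 2, served by LoRA ids 1 and 2.
    # index_mapping has one entry per token; prompt_mapping has one entry per
    # sequence (or one per prompt token when prompt_logprobs is requested).
    lora_mapping = LoRAMapping(
        index_mapping=(1, 1, 1, 2, 2),
        prompt_mapping=(1, 2),
    )
    # The batch_mapping/seq_lens fields of the earlier revision are dropped;
    # the prefill-vs-decode flag is instead read out of indices_len inside the
    # LoRA layer code (see the layers.py changes later in this series).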
@@ -624,7 +673,7 @@ def prepare_input_tensors( query_lens, lora_mapping, lora_requests, - multi_modal_input, + multi_modal_kwargs, slot_mapping, num_prefill_tokens, num_decode_tokens, @@ -641,7 +690,7 @@ def prepare_input_tensors( sampling_metadata.selected_token_indices, "lora_requests": lora_requests, "lora_mapping": lora_mapping, - "multi_modal_input": multi_modal_input, + "multi_modal_kwargs": multi_modal_kwargs, "num_prefill_tokens": num_prefill_tokens, "num_decode_tokens": num_decode_tokens, "slot_mapping": slot_mapping, @@ -658,7 +707,7 @@ def prepare_input_tensors( "selected_token_indices") lora_mapping = metadata_dict.pop("lora_mapping") lora_requests = metadata_dict.pop("lora_requests") - multi_modal_input = metadata_dict.pop("multi_modal_input") + multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs") if metadata_dict: attn_metadata = self.attn_backend.make_metadata( **metadata_dict) @@ -673,7 +722,7 @@ def prepare_input_tensors( return (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, - multi_modal_input) + multi_modal_kwargs) @torch.inference_mode() def execute_model( @@ -682,7 +731,7 @@ def execute_model( kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: (input_tokens, input_positions, attn_metadata, sampling_metadata, - lora_requests, lora_mapping, multi_modal_input + lora_requests, lora_mapping, multi_modal_kwargs ) = self.prepare_input_tensors(seq_group_metadata_list) if self.lora_config: @@ -696,15 +745,14 @@ def execute_model( model_executable = self.graph_runners[graph_batch_size] else: model_executable = self.model - execute_model_kwargs = { - "input_ids": input_tokens, - "positions": input_positions, - "kv_caches": kv_caches, - "attn_metadata": attn_metadata, - } - if self.vision_language_config: - execute_model_kwargs.update({"image_input": multi_modal_input}) - hidden_states = model_executable(**execute_model_kwargs) + + hidden_states = model_executable( + input_ids=input_tokens, + positions=input_positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + **multi_modal_kwargs, + ) # Compute the logits. logits = self.model.compute_logits(hidden_states, sampling_metadata) @@ -760,16 +808,24 @@ def profile_run(self) -> None: # To exercise the worst scenario for GPU memory consumption, # the number of seqs (batch_size) is chosen to maximize the number # of images processed. 
- if self.vision_language_config: + model_config = self.model_config + vlm_config = self.vision_language_config + + if vlm_config: max_num_seqs = min( max_num_seqs, - int(max_num_batched_tokens / - self.vision_language_config.image_feature_size)) + int(max_num_batched_tokens / vlm_config.image_feature_size)) for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) - seq_data, fake_multi_modal_input = _prepare_fake_inputs( - seq_len, self.vision_language_config) + + if vlm_config is None: + seq_data = SequenceData([0] * seq_len) + dummy_multi_modal_data = None + else: + seq_data, dummy_multi_modal_data = MULTIMODAL_REGISTRY \ + .dummy_data_for_profiling(seq_len, model_config, vlm_config) + seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, @@ -778,7 +834,7 @@ def profile_run(self) -> None: block_tables=None, lora_request=dummy_lora_requests_per_seq[group_id] if dummy_lora_requests_per_seq else None, - multi_modal_data=fake_multi_modal_input, + multi_modal_data=dummy_multi_modal_data, ) seqs.append(seq) @@ -789,32 +845,6 @@ def profile_run(self) -> None: torch.cuda.synchronize() return - # def compose_lora_kernel_meta( - # self, - # attn_metadata: AttentionMetadata, - # ) -> LoRAKernelMeta: - # if attn_metadata.prefill_metadata: - # max_seq_len = attn_metadata.max_query_len - # seq_start_loc = attn_metadata.query_start_loc - # seq_lens_tensor = attn_metadata.seq_lens_tensor - # batch_size = attn_metadata.num_prefills - # else: - # max_seq_len = attn_metadata.max_query_len - # seq_start_loc = attn_metadata.query_start_loc - # batch_size = attn_metadata.decode_metadata.num_decode_tokens - # seq_lens_tensor = torch.ones((batch_size), - # dtype=torch.long, - # device=self.device) - - # if batch_size == 0: - # print("sssss") - # # lora_index_lst = lora_mapping.batch_mapping - # # lora_index_tensor = torch.tensor(lora_index_lst, - # # dtype=torch.long, - # # device=self.device) - # return LoRAKernelMeta(batch_size, max_seq_len, seq_lens_tensor, - # seq_start_loc) - def remove_all_loras(self): if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") @@ -876,6 +906,10 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: seq_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda() block_tables = torch.from_numpy(self.graph_block_tables).cuda() + # Prepare buffer for outputs. These will be reused for all batch sizes. + # It will be filled after the first graph capture. 
+ hidden_states: Optional[torch.Tensor] = None + graph_batch_size = _get_graph_batch_size( self.scheduler_config.max_num_seqs) batch_size_capture_list = [ @@ -905,16 +939,18 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: ) if self.lora_config: - lora_mapping = LoRAMapping([0] * batch_size, - [0] * batch_size, - [0] * batch_size, - [1] * batch_size, False) + lora_mapping = LoRAMapping( + [0] * batch_size, + [0] * batch_size, + ) self.set_active_loras(set(), lora_mapping) graph_runner = CUDAGraphRunner(self.model) - graph_runner.capture( + hidden_states = graph_runner.capture( input_tokens[:batch_size], input_positions[:batch_size], + hidden_states[:batch_size] + if hidden_states is not None else None, kv_caches, attn_metadata, memory_pool=self.graph_memory_pool, @@ -951,35 +987,46 @@ def capture( self, input_ids: torch.Tensor, positions: torch.Tensor, + hidden_states: Optional[torch.Tensor], kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, memory_pool: Optional[Tuple[int, int]], stream: torch.cuda.Stream, **kwargs, - ) -> None: + ) -> torch.Tensor: assert self._graph is None - # Run the model once without capturing the graph. + # Run the model a few times without capturing the graph. # This is to make sure that the captured graph does not include the # kernel launches for initial benchmarking (e.g., Triton autotune). - self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - **kwargs, - ) + # Note one iteration is not enough for torch.jit.script + for _ in range(_NUM_WARMUP_ITERS): + self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + **kwargs, + ) torch.cuda.synchronize() # Capture the graph. self._graph = torch.cuda.CUDAGraph() with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream): - hidden_states = self.model( + output_hidden_states = self.model( input_ids, positions, kv_caches, attn_metadata, **kwargs, ) + if hidden_states is not None: + hidden_states.copy_(output_hidden_states) + else: + hidden_states = output_hidden_states + del output_hidden_states + # make sure `output_hidden_states` is deleted + # in the graph's memory pool + gc.collect() torch.cuda.synchronize() # Save the input and output buffers. @@ -992,7 +1039,7 @@ def capture( "block_tables": attn_metadata.decode_metadata.block_tables, } self.output_buffers = {"hidden_states": hidden_states} - return + return hidden_states def forward( self, @@ -1039,24 +1086,6 @@ def _get_graph_batch_size(batch_size: int) -> int: _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) -def _prepare_fake_inputs( - seq_len: int, vision_language_config: Optional[VisionLanguageConfig]): - """Prepare fake inputs for profile run.""" - if vision_language_config: - prompt_tokens = [ - vision_language_config.image_token_id - ] * vision_language_config.image_feature_size + [0] * ( - seq_len - vision_language_config.image_feature_size) - fake_image_input = MultiModalData( - type=MultiModalData.Type.IMAGE, - data=torch.zeros(vision_language_config.image_input_shape, - dtype=torch.float16)) - else: - prompt_tokens = [0] * seq_len - fake_image_input = None - return SequenceData(prompt_tokens), fake_image_input - - def _is_block_tables_empty(block_tables: Union[None, Dict]): """ Check if block_tables is None or a dictionary with all None values. 
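The capture path above warms the model up for _NUM_WARMUP_ITERS before recording, and capture() now returns the hidden-states buffer so smaller batch sizes can reuse a slice of it. A rough sketch of the call pattern, assuming batch sizes are visited largest-first in capture_model and that the surrounding attributes (self.model, self.graph_memory_pool, kv_caches, and a capture stream named graph_capture_stream here) are already in scope:

    # Illustrative only; memory-pool and stream plumbing is elided.
    hidden_states = None
    for batch_size in sorted(batch_size_capture_list, reverse=True):
        graph_runner = CUDAGraphRunner(self.model)
        hidden_states = graph_runner.capture(
            input_tokens[:batch_size],
            input_positions[:batch_size],
            # Reuse the buffer captured for the largest batch, if any.
            hidden_states[:batch_size] if hidden_states is not None else None,
            kv_caches,
            attn_metadata,
            memory_pool=self.graph_memory_pool,
            stream=graph_capture_stream,  # assumed name for the capture stream
        )
        self.graph_runners[batch_size] = graph_runner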
From 2bc0668bde9d8931bd6ec38d21558aeffc2f1a81 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 18 Jun 2024 11:33:12 +0800 Subject: [PATCH 25/71] trigger testing --- tests/lora/test_triton_punica.py | 2 +- vllm/lora/fully_sharded_layers.py | 238 +++++++++--- vllm/lora/layers.py | 591 ++++++++++++++++-------------- vllm/lora/models.py | 293 ++++++++------- vllm/lora/punica.py | 385 +++++++++---------- vllm/worker/model_runner.py | 294 ++++++--------- 6 files changed, 950 insertions(+), 853 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 1a5fd9e3f4d7..a098aba16456 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -468,7 +468,7 @@ def test_triton_bgmv_punica_bgmv( ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) -@pytest.mark.skip("stop") + @pytest.mark.parametrize("batchs", BATCHS) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", NSLICES) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index ffdc32b7339a..fbea667a215e 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -7,13 +7,22 @@ from vllm.config import LoRAConfig from vllm.distributed.communication_op import ( - tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, +) from vllm.distributed.parallel_state import get_tensor_model_parallel_rank -from vllm.lora.layers import (ColumnParallelLinearWithLoRA, - MergedColumnParallelLinearWithLoRA, - MergedQKVParallelLinearWithLora, - RowParallelLinearWithLoRA) +from vllm.lora.layers import ( + ColumnParallelLinearWithLoRA, + MergedColumnParallelLinearWithLoRA, + MergedQKVParallelLinearWithLora, + RowParallelLinearWithLoRA, +) from vllm.lora.punica import bgmv, dispatch_bgmv_low_level +from vllm.lora.punica import ( + add_shrink_triton, + add_expand_triton, + add_expand_slice_triton, +) if TYPE_CHECKING: pass @@ -27,7 +36,7 @@ def _fully_sharded_can_replace(can_replace): def dec(*args, **kwargs): return (can_replace(*args, **kwargs) - and kwargs['lora_config'].fully_sharded_loras) + and kwargs["lora_config"].fully_sharded_loras) return dec @@ -58,15 +67,49 @@ def apply(self, x: torch.Tensor, x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape - buffer = torch.zeros((x.shape[0], self.lora_a_stacked.shape[2]), - dtype=torch.float32, - device=x.device) + buffer = torch.zeros( + (x.shape[0], self.lora_a_stacked.shape[2]), + dtype=torch.float32, + device=x.device, + ) - bgmv(buffer, x, self.lora_a_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) + # bgmv( + # buffer, + # x, + # self.lora_a_stacked, + # self.indices[: self.indices_len[0]], + # 0, + # 1.0, + # ) + token_num = self.indices_len[0] + is_prefilling = bool(self.indices_len[4]) + add_shrink_triton( + buffer, + x, + self.lora_a_stacked, + self.indices[:token_num], + 0, + 1.0, + is_prefilling, + ) buffer = tensor_model_parallel_all_gather(buffer) - bgmv(output, buffer, self.lora_b_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) + # bgmv( + # output, + # buffer, + # self.lora_b_stacked, + # self.indices[: self.indices_len[0]], + # 0, + # 1.0, + # ) + add_expand_triton( + output, + buffer, + self.lora_b_stacked, + self.indices[:token_num], + 0, + is_prefilling, + add_input=True, + ) # now have column partitioned output output = 
output.view(*out_orig_shape) @@ -74,9 +117,13 @@ def apply(self, x: torch.Tensor, @classmethod @_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( source_layer=source_layer, @@ -89,12 +136,12 @@ def can_replace_layer(cls, source_layer: nn.Module, def _mcp_apply(x, bias, layer): """ - MergedColumnParallelLinearWithShardedLoRA and - QKVParallelLinearWithShardedLora share the same + MergedColumnParallelLinearWithShardedLoRA and + QKVParallelLinearWithShardedLora share the same LoRa weight application method. - + The main difference is the step by shard_size for lora_b which can - vary for QKVParallelLinearWithShardedLora but is constant for + vary for QKVParallelLinearWithShardedLora but is constant for MergedColumnParallelLinearWithShardedLoRA. """ # expecting 2 for column parallel and 3 for qkv @@ -103,21 +150,58 @@ def _mcp_apply(x, bias, layer): x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape - buffers = torch.zeros((n, x.shape[0], layer.lora_a_stacked[0].shape[2]), - dtype=torch.float32, - device=x.device) + buffers = torch.zeros( + (n, x.shape[0], layer.lora_a_stacked[0].shape[2]), + dtype=torch.float32, + device=x.device, + ) + token_num = layer.indices_len[0] + is_prefilling = bool(layer.indices_len[4]) for idx in range(n): - bgmv(buffers[idx], x, layer.lora_a_stacked[idx], - layer.indices[:layer.indices_len[0]], 0, 1.0) + # bgmv( + # buffers[idx], + # x, + # layer.lora_a_stacked[idx], + # layer.indices[: layer.indices_len[0]], + # 0, + # 1.0, + # ) + + add_shrink_triton( + buffers[idx], + x, + layer.lora_a_stacked[idx], + layer.indices[:token_num], + 0, + 1.0, + is_prefilling, + ) buffers = tensor_model_parallel_all_gather(buffers) left_offset = 0 for idx in range(n): shard_size = layer.lora_b_stacked[idx].shape[2] - dispatch_bgmv_low_level(output, buffers[idx], - layer.lora_b_stacked[idx], - layer.indices[:layer.indices_len[0]], 0, 1.0, - left_offset, shard_size) + # dispatch_bgmv_low_level( + # output, + # buffers[idx], + # layer.lora_b_stacked[idx], + # layer.indices[: layer.indices_len[0]], + # 0, + # 1.0, + # left_offset, + # shard_size, + # ) + add_expand_slice_triton( + output, + buffers[idx], + layer.lora_b_stacked[idx], + layer.indices[:layer.indices_len[0]], + 0, + is_prefilling, + left_offset, + shard_size, + add_input=True, + ) left_offset += shard_size output = output.view(*out_orig_shape) @@ -128,7 +212,7 @@ def _mcp_apply(x, bias, layer): class MergedColumnParallelLinearWithShardedLoRA( MergedColumnParallelLinearWithLoRA): """ - Differs from MergedColumnParallelLinearWithLoRA by slicing the + Differs from MergedColumnParallelLinearWithLoRA by slicing the LoRA A's also. Based on S-LoRA, slicing happens along the rank dim. 
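Condensed restatement of the flow in _mcp_apply above, purely as a reading aid; the helper names are the ones imported from vllm.lora.punica in this patch, while the function name and shapes in the comments are illustrative:

    import torch
    from vllm.distributed.communication_op import tensor_model_parallel_all_gather
    from vllm.lora.punica import add_shrink_triton, add_expand_slice_triton

    def sharded_lora_apply(x, lora_a_stacked, lora_b_stacked, indices,
                           indices_len, output):
        token_num = indices_len[0]
        is_prefilling = bool(indices_len[4])
        n = len(lora_a_stacked)
        # Each shrink writes a rank-sharded projection into its own fp32 buffer.
        buffers = torch.zeros((n, x.shape[0], lora_a_stacked[0].shape[2]),
                              dtype=torch.float32, device=x.device)
        for i in range(n):
            add_shrink_triton(buffers[i], x, lora_a_stacked[i],
                              indices[:token_num], 0, 1.0, is_prefilling)
        # All-gather completes the rank dimension across tensor-parallel ranks.
        buffers = tensor_model_parallel_all_gather(buffers)
        # Each expand adds its slice of the output at a running column offset.
        offset = 0
        for i in range(n):
            shard_size = lora_b_stacked[i].shape[2]
            add_expand_slice_triton(output, buffers[i], lora_b_stacked[i],
                                    indices[:token_num], 0, is_prefilling,
                                    offset, shard_size, add_input=True)
            offset += shard_size
        return output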
@@ -144,7 +228,8 @@ def slice_lora_a( lora_a = [ lora_a[0][:, output_start_idx:output_start_idx + output_shard_size], - lora_a[1][:, output_start_idx:output_start_idx + output_shard_size] + lora_a[1][:, + output_start_idx:output_start_idx + output_shard_size], ] return lora_a @@ -154,9 +239,13 @@ def apply(self, x: torch.Tensor, @classmethod @_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( source_layer=source_layer, @@ -169,7 +258,7 @@ def can_replace_layer(cls, source_layer: nn.Module, class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora): """ - Differs from QKVParallelLinearWithLora by slicing the + Differs from QKVParallelLinearWithLora by slicing the LoRA A's also. Based on S-LoRA, slicing happens along the rank dim. @@ -185,7 +274,7 @@ def slice_lora_a( lora_a = [ lora_a[0][:, start_idx[0]:start_idx[0] + shard_size[0]], lora_a[1][:, start_idx[1]:start_idx[1] + shard_size[1]], - lora_a[2][:, start_idx[2]:start_idx[2] + shard_size[2]] + lora_a[2][:, start_idx[2]:start_idx[2] + shard_size[2]], ] return lora_a @@ -195,9 +284,13 @@ def apply(self, x: torch.Tensor, @classmethod @_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( source_layer=source_layer, @@ -210,11 +303,11 @@ def can_replace_layer(cls, source_layer: nn.Module, class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA): """ - Differs from RowParallelLinearWithLoRA by slicing the + Differs from RowParallelLinearWithLoRA by slicing the LoRA B's also. Based on S-LoRA, slicing happens along the output dim. - This yields a combined partial sum from the row parallel base + This yields a combined partial sum from the row parallel base layer and column partitioned output from the LoRA. 
""" @@ -231,11 +324,30 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape - buffer = torch.zeros((x.shape[0], self.lora_a_stacked.shape[2]), - dtype=torch.float32, - device=x.device) - bgmv(buffer, x, self.lora_a_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) + buffer = torch.zeros( + (x.shape[0], self.lora_a_stacked.shape[2]), + dtype=torch.float32, + device=x.device, + ) + # bgmv( + # buffer, + # x, + # self.lora_a_stacked, + # self.indices[: self.indices_len[0]], + # 0, + # 1.0, + # ) + token_num = self.indices_len[0] + is_prefilling = bool(self.indices_len[4]) + add_shrink_triton( + buffer, + x, + self.lora_a_stacked, + self.indices[:token_num], + 0, + 1.0, + is_prefilling, + ) buffer = tensor_model_parallel_all_reduce(buffer) # following S-LoRA, allows the fusing of all_gather and all_reduce @@ -246,18 +358,38 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # reduced before being used shard_size = self.lora_b_stacked.shape[2] start_idx = self.tp_rank * shard_size - dispatch_bgmv_low_level(output, buffer, self.lora_b_stacked, - self.indices[:self.indices_len[0]], 0, 1.0, - start_idx, shard_size) - + # dispatch_bgmv_low_level( + # output, + # buffer, + # self.lora_b_stacked, + # self.indices[: self.indices_len[0]], + # 0, + # 1.0, + # start_idx, + # shard_size, + # ) + add_expand_slice_triton( + output, + buffer, + self.lora_b_stacked, + self.indices[:self.indices_len[0]], + 0, + is_prefilling, + start_idx, + shard_size, + ) output = output.view(*out_orig_shape) return output @classmethod @_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( source_layer=source_layer, diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 96b37ab8880c..1dd89df3c4f6 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -9,24 +9,34 @@ from transformers import PretrainedConfig from vllm.config import LoRAConfig -from vllm.distributed import (get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - split_tensor_along_last_dim, - tensor_model_parallel_all_gather, - tensor_model_parallel_all_reduce, - tensor_model_parallel_gather) +from vllm.distributed import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + split_tensor_along_last_dim, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, + tensor_model_parallel_gather, +) from vllm.distributed.utils import divide -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.punica import add_lora_triton, add_lora_triton_slice -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) + +# from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.punica import ( + add_lora_triton, + add_expand_triton, +) +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import ( 
- LinearScalingRotaryEmbedding, RotaryEmbedding) + LinearScalingRotaryEmbedding, + RotaryEmbedding, +) from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding) + VocabParallelEmbedding, ) if TYPE_CHECKING: pass @@ -55,26 +65,49 @@ def _not_fully_sharded_can_replace(can_replace): """ def dec(*args, **kwargs): - decorate = kwargs.pop('decorate') if 'decorate' in kwargs else True - condition = (not kwargs['lora_config'].fully_sharded_loras + decorate = kwargs.pop("decorate") if "decorate" in kwargs else True + condition = (not kwargs["lora_config"].fully_sharded_loras if decorate else True) return can_replace(*args, **kwargs) and condition return dec +def _apply_expand_triton( + x: torch.Tensor, + lora_b_stacked: torch.Tensor, + lora_index_tensor: torch.Tensor, + indices_info: List[int], + output: torch.Tensor, + add_input: bool = True, +) -> torch.Tensor: + org_output = output + x = x.view(-1, x.shape[-1]) + output = output.view(-1, output.shape[-1]) + token_num = indices_info[0] + is_prefilling = bool(indices_info[4]) + add_expand_triton( + output, + x, + lora_b_stacked, + lora_index_tensor[:token_num], + 0, + is_prefilling, + add_input, + ) + return output.view_as(org_output) + + def _apply_lora_triton( x: torch.Tensor, lora_a_stacked: torch.Tensor, lora_b_stacked: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - seq_length_tensor: torch.Tensor, lora_index_tensor: torch.Tensor, - batch_mlen_stage_lst: List[int], + indices_info: List[int], output: torch.Tensor, ) -> torch.Tensor: - """Applies lora to each input. This method applies all loras to each - input. It uses the `lora_index_tensor` vector to determine which lora + """Applies lora to each input. This method applies all loras to each + input. It uses the `lora_index_tensor` vector to determine which lora yields the correct output. An index of -1 means no lora should be applied. This method adds the final lora results to the output. @@ -82,35 +115,33 @@ def _apply_lora_triton( x (torch.Tensor): (batch_size, hidden_dim) lora_a_stacked (torch.Tensor): (num_loras, lora_rank, hidden_dim) lora_b_stacked (torch.Tensor): (num_loras, output_dim, lora_rank) - b_seq_start_tensor (torch.Tensor): (batch_size,). The cumulative - sequence lengths of the sequences in the batch, used to index - into sequence. E.g.,if the sequence length is [4, 6], it is - [0, 4]. - seq_length_tensor (torch.Tensor): batch_size,). record the sequence - length of the sequences in the batch - lora_index_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch - batch_mlen_stage_lst (List[int]): (3,).Sequentially represent batch - size, maximum seq length, and prefilling stage flag. - output (torch.Tensor): (batch_size, output_dim) - + lora_index_tensor (torch.Tensor): (batch_size*seq_number,). The LoRA + index corresponding to each token + indices_info: List[int]: 5 is the number of indicies tensors. 
+ # base_indices, sampler_indices, sampler_indices_padded, + # embeddings_indices,prefilling or decoding + output (torch.Tensor): (batch_size, output_dim) + Returns: - output (torch.Tensor): (batch_size, output_dim) - + output (torch.Tensor): (batch_size*seq_number, output_dim) + """ org_output = output x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) - # - batch_size = batch_mlen_stage_lst[0] - max_length = batch_mlen_stage_lst[1] - is_prefilling = bool(batch_mlen_stage_lst[2]) - - add_lora_triton(output, x, lora_a_stacked, lora_b_stacked, - b_seq_start_tensor[:batch_size], - seq_length_tensor[:batch_size], - lora_index_tensor[:batch_size], batch_size, max_length, 0, - 1.0, is_prefilling) + + token_num = indices_info[0] + is_prefilling = bool(indices_info[4]) + add_lora_triton( + output, + x, + lora_a_stacked, + lora_b_stacked, + lora_index_tensor[:token_num], + 0, + 1.0, + is_prefilling, + ) return output.view_as(org_output) @@ -118,52 +149,46 @@ def _apply_lora_triton_nslice( x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - b_seq_start_tensor: torch.Tensor, - seq_length_tensor: torch.Tensor, lora_index_tensor: torch.Tensor, - batch_mlen_stage_lst: List[int], + indices_info: List[int], output: torch.Tensor, output_slices: Tuple[int, ...], ) -> torch.Tensor: - """Applies lora to each input. This method applies all loras to each - input. It uses the `lora_index_tensor` vector to determine which lora - yields the correct output. An index of -1 means no lora should be - applied. This method adds the final lora results to the output. + """_summary_ Args: - x (torch.Tensor): (batch_size, hidden_dim) - lora_a_stacked (torch.Tensor): (num_loras, lora_rank, hidden_dim) - lora_b_stacked (torch.Tensor): (num_loras, output_dim, lora_rank) - b_seq_start_tensor (torch.Tensor): (batch_size,). The cumulative - sequence lengths of the sequences in the batch, used to index - into sequence. E.g.,if the sequence length is [4, 6], it is - [0, 4]. - seq_length_tensor (torch.Tensor): batch_size,). record the sequence - length of the sequences in the batch - lora_index_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch - batch_mlen_stage_lst (List[int]): (3,).Sequentially represent batch - size, maximum seq length, and prefilling stage flag. 
- output_slices (Tuple[int, ...]): Size of each output column + x (torch.Tensor): _description_ + lora_a_stacked (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): _description_ + lora_b_stacked (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): _description_ + lora_index_tensor (torch.Tensor): _description_ + indices_info (List[int]): _description_ + output (torch.Tensor): _description_ + output_slices (Tuple[int, ...]): _description_ Returns: - output (torch.Tensor): (batch_size, output_dim) + torch.Tensor: _description_ """ org_output = output x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) - batch_size = batch_mlen_stage_lst[0] - max_length = batch_mlen_stage_lst[1] - is_prefilling = bool(batch_mlen_stage_lst[2]) + token_num = indices_info[0] + is_prefilling = bool(indices_info[4]) offset_left = 0 - #TODO fuse these kernel + # TODO fuse these kernels for slice_idx in range(len(output_slices)): - add_lora_triton_slice( - output, x, lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], - b_seq_start_tensor[:batch_size], seq_length_tensor[:batch_size], - lora_index_tensor[:batch_size], batch_size, max_length, 0, 1.0, - offset_left, output_slices[slice_idx], is_prefilling) + add_lora_triton( + output, + x, + lora_a_stacked[slice_idx], + lora_b_stacked[slice_idx], + lora_index_tensor[:token_num], + 0, + 1.0, + is_prefilling, + offset_left, + output_slices[slice_idx], + ) offset_left += output_slices[slice_idx] return output.view_as(org_output) @@ -175,10 +200,6 @@ class LoRAMapping: index_mapping: Tuple[int, ...] # Per sampled token: prompt_mapping: Tuple[int, ...] - # Per batch lora index - batch_mapping: List[int] = field(default_factory=list) - # Per batch seq length - seq_lens: List[int] = field(default_factory=list) # prefilling or decoding. is_prefilling: bool = False @@ -202,10 +223,11 @@ def slice_lora_b( ... def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: """Initializes lora matrices.""" ... @@ -224,18 +246,25 @@ def set_lora( ... def set_mapping( - self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, - indices_len: List[int], seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + long_lora_indices: torch.Tensor, + indices_len: List[int], + ): """Sets the mapping indices.""" ... 
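A small worked example of the running column offsets used by _apply_lora_triton_nslice above; the shard sizes are invented, and with three slices they would correspond to this rank's Q, K and V partitions:

    # e.g. q_proj_shard_size=4096 and kv_proj_shard_size=1024 on this rank
    output_slices = (4096, 1024, 1024)

    offset_left = 0
    for slice_idx, slice_size in enumerate(output_slices):
        # slice 0 -> output[:, 0:4096]     (Q)
        # slice 1 -> output[:, 4096:5120]  (K)
        # slice 2 -> output[:, 5120:6144]  (V)
        print(slice_idx, offset_left, offset_left + slice_size)
        offset_left += slice_size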
@classmethod - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" raise NotImplementedError @@ -249,22 +278,23 @@ def __init__(self, base_layer: VocabParallelEmbedding) -> None: self.embeddings_weights: Optional[torch.Tensor] def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: - + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: lora_vocab_start_idx = self.base_layer.org_vocab_size weights_idx = None if self.base_layer.vocab_end_index > lora_vocab_start_idx: # We can start adding lora weights weights_idx = max( lora_vocab_start_idx - self.base_layer.vocab_start_index, 0) - self.embeddings_slice = (self.base_layer.vocab_start_index - - self.base_layer.org_vocab_size + - weights_idx, - self.base_layer.vocab_end_index - - self.base_layer.org_vocab_size) + self.embeddings_slice = ( + self.base_layer.vocab_start_index - + self.base_layer.org_vocab_size + weights_idx, + self.base_layer.vocab_end_index - + self.base_layer.org_vocab_size, + ) self.embeddings_weights = self.base_layer.weight.data[weights_idx:] self.embeddings_weights.fill_(0) else: @@ -309,10 +339,6 @@ def create_lora_weights( self.indices_len: List[int] self.embeddings_indices: torch.Tensor - self.seq_length_tensor: torch.Tensor - self.b_seq_start_tensor: torch.Tensor - self.batch_mlen_stage_lst: List[int] - def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 @@ -334,30 +360,30 @@ def set_lora( if embeddings_tensor is not None: self.embeddings_tensors[ index, :embeddings_tensor.shape[0], :embeddings_tensor. 
- shape[1]].copy_(embeddings_tensor, non_blocking=True) + shape[1], ].copy_(embeddings_tensor, non_blocking=True) if self.embeddings_slice is not None: # TODO(yard1): Optimize this copy, we don't need to copy # everything, just the modified part embeddings = self.embeddings_tensors.view( self.embeddings_tensors.shape[0] * self.embeddings_tensors.shape[1], - self.embeddings_tensors.shape[2] + self.embeddings_tensors.shape[2], )[self.embeddings_slice[0]:self.embeddings_slice[1]] assert self.embeddings_weights is not None self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings) def set_mapping( - self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, - indices_len: List[int], seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + long_lora_indices: torch.Tensor, + indices_len: List[int], + ): self.indices = base_indices self.embeddings_indices = embeddings_indices self.indices_len = indices_len - self.seq_length_tensor = seq_length_tensor - self.b_seq_start_tensor = b_seq_start_tensor - self.batch_mlen_stage_lst = batch_mlen_stage_lst def forward(self, x: torch.Tensor) -> torch.Tensor: added_tokens_mask = x > self.base_layer.org_vocab_size - 1 @@ -378,34 +404,34 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if full_lora_a_embeddings.ndim == 3: full_lora_a_embeddings = full_lora_a_embeddings.view( full_lora_a_embeddings.shape[0] * - full_lora_a_embeddings.shape[1], -1) - batch_size, max_length = self.batch_mlen_stage_lst[ - 0], self.batch_mlen_stage_lst[1] - - sgmv_expand( + full_lora_a_embeddings.shape[1], + -1, + ) + _apply_expand_triton( full_lora_a_embeddings, self.lora_b_stacked, + self.indices, + self.indices_len, full_output, - self.b_seq_start_tensor[:batch_size], - self.seq_length_tensor[:batch_size], - self.indices[:batch_size], - batch_size, - max_length, - True, + add_input=True, ) return full_output.view_as(full_output_org) @classmethod - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: return type(source_layer) is VocabParallelEmbedding class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA): """ LoRA on top of ColumnParallelLinear layer. - + LoRA B is sliced for tensor parallelism. """ @@ -418,10 +444,11 @@ def __init__(self, base_layer: ColumnParallelLinear) -> None: self.device = _get_lora_device(self.base_layer) def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: self.lora_config = lora_config self.tp_size = get_tensor_model_parallel_world_size() lora_a_output_size_per_partition = ( @@ -448,9 +475,6 @@ def create_lora_weights( # lazily initialized. 
self.indices: torch.Tensor self.indices_len: List[int] - self.seq_length_tensor: torch.Tensor - self.b_seq_start_tensor: torch.Tensor - self.batch_mlen_stage_lst: List[int] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -488,23 +512,28 @@ def set_lora( lora_b.T, non_blocking=True) def set_mapping( - self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, - indices_len: List[int], seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + long_lora_indices: torch.Tensor, + indices_len: List[int], + ): self.indices = base_indices self.indices_len = indices_len - self.seq_length_tensor = seq_length_tensor - self.b_seq_start_tensor = b_seq_start_tensor - self.batch_mlen_stage_lst = batch_mlen_stage_lst def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor, self.seq_length_tensor, - self.indices, self.batch_mlen_stage_lst, output) + _apply_lora_triton( + x, + self.lora_a_stacked, + self.lora_b_stacked, + self.indices, + self.indices_len, + output, + ) return output def forward(self, input_): @@ -533,9 +562,13 @@ def forward(self, input_): @classmethod @_not_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: return type(source_layer) is ColumnParallelLinear or ( type(source_layer) is MergedColumnParallelLinear and len(packed_modules_list) == 1) @@ -554,10 +587,11 @@ def __init__(self, base_layer: MergedColumnParallelLinear) -> None: super().__init__(base_layer) def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: self.lora_config = lora_config n_slices = 2 if not (len(self.base_layer.output_sizes) == n_slices @@ -597,11 +631,6 @@ def create_lora_weights( self.indices: torch.Tensor self.indices_len: torch.Tensor - self.seq_length_tensor: torch.Tensor - self.b_seq_start_tensor: torch.Tensor - self.lora_index_tensor: torch.Tensor - self.batch_mlen_stage_lst: List[int] - def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 self.lora_a_stacked[1][index] = 0 @@ -622,7 +651,8 @@ def slice_lora_b( start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size lora_b = [ - lora_b[0][:, start_idx:end_idx], lora_b[1][:, start_idx:end_idx] + lora_b[0][:, start_idx:end_idx], + lora_b[1][:, start_idx:end_idx], ] return lora_b @@ -661,10 +691,8 @@ def apply(self, x: torch.Tensor, x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor, - self.seq_length_tensor, self.indices, - self.batch_mlen_stage_lst, + self.indices_len, output, (self.output_dim, self.output_dim), ) @@ -672,22 +700,26 @@ def apply(self, x: torch.Tensor, @classmethod 
@_not_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: - return type(source_layer) is MergedColumnParallelLinear and len( - packed_modules_list) == 2 + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: + return (type(source_layer) is MergedColumnParallelLinear + and len(packed_modules_list) == 2) class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): """ - ColumnParallelLinear layer that is specifically designed for - qkv_proj. Certain models, such as chtglm3 and baichuan-7b, - only contains a single LoRA within their qkv_proj layer. + ColumnParallelLinear layer that is specifically designed for + qkv_proj. Certain models, such as chtglm3 and baichuan-7b, + only contains a single LoRA within their qkv_proj layer. - During inference with Tensor Parallel, the weights of lora_b + During inference with Tensor Parallel, the weights of lora_b must be accurately partitioned according to the respective ranks. - + Q slice may have different shape than K and V slices (which both have the same shape). """ @@ -718,15 +750,17 @@ def set_lora( self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas lora_b_q = lora_b[:, self.q_proj_shard_size * self.q_shard_id:self.q_proj_shard_size * - (self.q_shard_id + 1)] + (self.q_shard_id + 1), ] k_offset = self.q_proj_total_size lora_b_k = lora_b[:, k_offset + self.kv_proj_shard_size * self.kv_shard_id:k_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] + self.kv_proj_shard_size * + (self.kv_shard_id + 1), ] v_offset = k_offset + self.kv_proj_total_size lora_b_v = lora_b[:, v_offset + self.kv_proj_shard_size * self.kv_shard_id:v_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] + self.kv_proj_shard_size * + (self.kv_shard_id + 1), ] lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1) self.lora_a_stacked[index, @@ -737,11 +771,15 @@ def set_lora( lora_b.T, non_blocking=True) @classmethod - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: - return type(source_layer) is QKVParallelLinear and len( - packed_modules_list) == 1 + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: + return (type(source_layer) is QKVParallelLinear + and len(packed_modules_list) == 1) class MergedQKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): @@ -759,10 +797,11 @@ def __init__(self, base_layer: QKVParallelLinear) -> None: super().__init__(base_layer) def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: self.lora_config = lora_config self.tp_size = get_tensor_model_parallel_world_size() self.tp_rank = get_tensor_model_parallel_rank() @@ -830,18 +869,17 @@ def create_lora_weights( ), ) - self.output_slices = (self.q_proj_shard_size, self.kv_proj_shard_size, - self.kv_proj_shard_size) + self.output_slices = ( + self.q_proj_shard_size, + self.kv_proj_shard_size, + self.kv_proj_shard_size, + ) self.packed_indices: Optional[torch.Tensor] = None 
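A small, self-contained illustration of the q/k/v column ranges sliced out of a fused lora_b in set_lora above for one tensor-parallel rank. The sizes are invented toy values, not shapes from this patch.

q_proj_total, kv_proj_total, tp_size = 8, 4, 2
q_shard = q_proj_total // tp_size
kv_shard = kv_proj_total // tp_size
q_shard_id, kv_shard_id = 1, 1                      # this rank's shard ids
q_cols = slice(q_shard * q_shard_id, q_shard * (q_shard_id + 1))
k_cols = slice(q_proj_total + kv_shard * kv_shard_id,
               q_proj_total + kv_shard * (kv_shard_id + 1))
v_cols = slice(q_proj_total + kv_proj_total + kv_shard * kv_shard_id,
               q_proj_total + kv_proj_total + kv_shard * (kv_shard_id + 1))
assert (q_cols, k_cols, v_cols) == (slice(4, 8), slice(10, 12), slice(14, 16))
# lora_b[:, q_cols], lora_b[:, k_cols] and lora_b[:, v_cols] are then
# concatenated along dim=1, mirroring the slicing above.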
self.standard_indices: Optional[torch.Tensor] = None # lazily initialized. self.indices: torch.Tensor self.indices_len: List[int] - self.seq_length_tensor: torch.Tensor - self.b_seq_start_tensor: torch.Tensor - self.batch_mlen_stage_lst: List[int] - def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 self.lora_b_stacked[0][index] = 0 @@ -862,15 +900,15 @@ def slice_lora_b( if lora_b[0] is not None: lora_b_q = lora_b[0][:, self.q_proj_shard_size * self.q_shard_id:self.q_proj_shard_size * - (self.q_shard_id + 1)] + (self.q_shard_id + 1), ] if lora_b[1] is not None: lora_b_k = lora_b[1][:, self.kv_proj_shard_size * self.kv_shard_id:self.kv_proj_shard_size * - (self.kv_shard_id + 1)] + (self.kv_shard_id + 1), ] if lora_b[2] is not None: lora_b_v = lora_b[2][:, self.kv_proj_shard_size * self.kv_shard_id:self.kv_proj_shard_size * - (self.kv_shard_id + 1)] + (self.kv_shard_id + 1), ] lora_b = [lora_b_q, lora_b_k, lora_b_v] return lora_b @@ -923,10 +961,8 @@ def apply(self, x: torch.Tensor, x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor, - self.seq_length_tensor, self.indices, - self.batch_mlen_stage_lst, + self.indices_len, output, self.output_slices, ) @@ -935,11 +971,15 @@ def apply(self, x: torch.Tensor, @classmethod @_not_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: - return type(source_layer) is QKVParallelLinear and len( - packed_modules_list) == 3 + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: + return (type(source_layer) is QKVParallelLinear + and len(packed_modules_list) == 3) class RowParallelLinearWithLoRA(BaseLayerWithLoRA): @@ -952,10 +992,11 @@ def __init__(self, base_layer: RowParallelLinear) -> None: self.device = _get_lora_device(self.base_layer) def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: self.lora_config = lora_config self.tp_rank = get_tensor_model_parallel_rank() self.lora_a_stacked = torch.zeros( @@ -987,11 +1028,6 @@ def create_lora_weights( self.indices: torch.Tensor self.indices_len: List[int] - self.seq_length_tensor: torch.Tensor - self.b_seq_start_tensor: torch.Tensor - self.lora_index_tensor: torch.Tensor - self.batch_mlen_stage_lst: List[int] - def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 @@ -1028,23 +1064,28 @@ def set_lora( lora_b.T, non_blocking=True) def set_mapping( - self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, - indices_len: List[int], seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + long_lora_indices: torch.Tensor, + indices_len: List[int], + ): self.indices = base_indices self.indices_len = indices_len - self.seq_length_tensor = seq_length_tensor - self.b_seq_start_tensor = b_seq_start_tensor - self.batch_mlen_stage_lst = batch_mlen_stage_lst def apply(self, x: torch.Tensor) 
-> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) # maybe we need not restrict range to [:batch_size] - _apply_lora_triton(x, self.lora_a_stacked, self.lora_b_stacked, - self.b_seq_start_tensor, self.seq_length_tensor, - self.indices, self.batch_mlen_stage_lst, output) + _apply_lora_triton( + x, + self.lora_a_stacked, + self.lora_b_stacked, + self.indices, + self.indices_len, + output, + ) return output def forward(self, input_): @@ -1087,14 +1128,18 @@ def forward(self, input_): @property def weight(self): - return self.base_layer.weight if hasattr( - self.base_layer, "weight") else self.base_layer.qweight + return (self.base_layer.weight if hasattr(self.base_layer, "weight") + else self.base_layer.qweight) @classmethod @_not_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: return type(source_layer) is RowParallelLinear @@ -1177,11 +1222,6 @@ def create_lora_weights( self.indices_len: List[int] self.indices_padded: torch.Tensor - self.seq_length_tensor: torch.Tensor - self.b_seq_start_tensor: torch.Tensor - self.lora_index_tensor: torch.Tensor - self.batch_mlen_stage_lst: List[int] - def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 @@ -1207,17 +1247,17 @@ def set_lora( shape[1], ] = embeddings_tensor def set_mapping( - self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, - indices_len: List[int], seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + long_lora_indices: torch.Tensor, + indices_len: List[int], + ): self.indices = sampler_indices self.indices_padded = sampler_indices_padded self.indices_len = indices_len - self.seq_length_tensor = seq_length_tensor - self.b_seq_start_tensor = b_seq_start_tensor - self.batch_mlen_stage_lst = batch_mlen_stage_lst def _get_logits( self, @@ -1255,16 +1295,22 @@ def _get_logits( neginf=float("-inf"))) logits[:, self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + - lora_logits.shape[1]] = lora_logits + lora_logits.shape[1], ] = lora_logits - batch_mlen_stage_lst = self.batch_mlen_stage_lst.copy() # LogitsProcessorWithLoRA always using bgmv - batch_mlen_stage_lst[2] = False - _apply_lora_triton(hidden_states, self.lora_a_stacked, - self.lora_b_stacked, self.b_seq_start_tensor, - self.seq_length_tensor, - self.indices[:self.indices_len[1]], - batch_mlen_stage_lst, logits) + # sampler_indices + sampler_indices = self.indices_len[1] + is_prefilling = False + add_lora_triton( + logits, + hidden_states, + self.lora_a_stacked, + self.lora_b_stacked, + self.indices[:sampler_indices], + 0, + 1.0, + is_prefilling, + ) # Remove paddings in vocab (if any). 
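The logits path above always takes the decoding (bgmv) branch because the sampler sees exactly one hidden-state row per sequence. A toy sketch of the shapes involved; all sizes, the stacked-weight layout, and the keyword spelling of the final flag are illustrative assumptions, and the actual call needs CUDA plus the Triton kernels.

import torch

num_seqs, hidden_size, lora_vocab, rank, max_loras = 3, 16, 32, 4, 2
hidden_states = torch.randn(num_seqs, hidden_size)       # one row per sequence
logits = torch.zeros(num_seqs, lora_vocab)
lora_a = torch.zeros(max_loras, 1, rank, hidden_size)     # assumed stacked layout
lora_b = torch.zeros(max_loras, 1, lora_vocab, rank)
sampler_indices = torch.tensor([0, 1, -1])                # LoRA slot per sequence
# The call above is, in spirit:
#   add_lora_triton(logits, hidden_states, lora_a, lora_b,
#                   sampler_indices, 0, 1.0, is_prefilling=False)
# i.e. always the per-row bgmv path, never the segment-based sgmv path.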
logits = logits[:, :self.base_layer.vocab_size] @@ -1274,9 +1320,13 @@ def forward(self, *args, **kwargs): return type(self.base_layer).forward(self, *args, **kwargs) @classmethod - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: # Special handling for the LogitsProcessor. return False @@ -1310,9 +1360,8 @@ def create_lora_weights( lora_config: LoRAConfig, model_config: Optional[PretrainedConfig] = None, ) -> None: - scaling_factors = list( - lora_config.long_lora_scaling_factors - ) if lora_config.long_lora_scaling_factors else [] + scaling_factors = (list(lora_config.long_lora_scaling_factors) + if lora_config.long_lora_scaling_factors else []) base_scaling_factor = (self.base_layer.scaling_factor if isinstance( self.base_layer, LinearScalingRotaryEmbedding) else 1.0) scaling_factors = sorted( @@ -1340,11 +1389,14 @@ def set_lora( ... def set_mapping( - self, base_indices: torch.Tensor, sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, long_lora_indices: torch.Tensor, - indices_len: List[int], seq_length_tensor: torch.Tensor, - b_seq_start_tensor: torch.Tensor, batch_mlen_stage_lst: List[int]): + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + long_lora_indices: torch.Tensor, + indices_len: List[int], + ): self.long_lora_indices = long_lora_indices self.indices_len = indices_len @@ -1358,19 +1410,24 @@ def forward( positions, query, key, - offsets=self.long_lora_indices[:self.indices_len[4]]) + offsets=self.long_lora_indices[:self.indices_len[4]], + ) @property def scaling_factor_to_offset(self) -> Dict[float, int]: return self.base_layer.scaling_factor_to_offset @classmethod - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: List, - model_config: Optional[PretrainedConfig]) -> bool: + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: List, + model_config: Optional[PretrainedConfig], + ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" - return type(source_layer) is LinearScalingRotaryEmbedding or type( - source_layer) is RotaryEmbedding + return (type(source_layer) is LinearScalingRotaryEmbedding + or type(source_layer) is RotaryEmbedding) def extra_repr(self) -> str: return self.base_layer.extra_repr() diff --git a/vllm/lora/models.py b/vllm/lora/models.py index b6c47e599e81..f817bf65ec96 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -12,19 +12,21 @@ from vllm.config import LoRAConfig from vllm.logger import init_logger -from vllm.lora.layers import (BaseLayerWithLoRA, - LinearScalingRotaryEmbeddingWithLora, - LoRAMapping) +from vllm.lora.layers import ( + BaseLayerWithLoRA, + LinearScalingRotaryEmbeddingWithLora, + LoRAMapping, +) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights -from vllm.lora.utils import (from_layer, from_layer_logits_processor, - parse_fine_tuned_lora_name, replace_submodule) +from vllm.lora.utils import ( + from_layer, + from_layer_logits_processor, + parse_fine_tuned_lora_name, + replace_submodule, +) from vllm.utils import LRUCache, is_pin_memory_available -# 
NOTE: The number of _MAX_BATCHS derived from worker's model_runner. -# _BATCH_SIZES_TO_CAPTURE.It needs to be updated if _BATCH_SIZES_TO_CAPTURE -# is changed. - -_MAX_BATCHS = 256 + 16 #max(_BATCH_SIZES_TO_CAPTURE)+16 +from vllm.lora import punica logger = init_logger(__name__) @@ -34,6 +36,7 @@ @dataclass class LongContextLoRAContext: """Context for lora adapters that support long context.""" + # The scaling factors to support long context lora fine tuned models. scaling_factors: List[float] # dimension to apply rotary embedding. @@ -51,7 +54,7 @@ def convert_mapping( extra_vocab_size: int, long_lora_context: Optional[LongContextLoRAContext] = None, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor], List[int]]: + Optional[torch.Tensor], List[int], ]: """Converts LoRAMapping to index tensors. Args: @@ -89,7 +92,7 @@ def convert_mapping( """ index_mapping_indices: List[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() - lora_indices = mapping.batch_mapping.copy() + lora_indices = index_mapping_indices.copy() long_lora_offsets: Optional[torch.Tensor] = None if long_lora_context: long_lora_offsets = torch.zeros(len(index_mapping_indices), @@ -99,27 +102,24 @@ def convert_mapping( lora_index_to_id.index(x) if x > 0 else -1 for x in mapping.prompt_mapping ] - token_lora_idx = None + lora_idx = None for i in range(len(index_mapping_indices)): # TODO index can be slow. optimize - token_lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) - if index_mapping_indices[i] > 0 else -1) - embedding_indices[ - i] = token_lora_idx if index_mapping_indices[i] > 0 else 0 + lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) + if index_mapping_indices[i] > 0 else -1) + embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 + lora_indices[i] = lora_idx if long_lora_context: assert long_lora_offsets is not None lora_offset: int = long_lora_context.offsets_by_lora_id.get( index_mapping_indices[i], 0) long_lora_offsets[i] = lora_offset - # every seq lora_id - for i in range(len(lora_indices)): - lora_indices[i] = (lora_index_to_id.index(lora_indices[i]) - if lora_indices[i] > 0 else -1) indices_list: List[Union[List[int], torch.Tensor]] = [ - index_mapping_indices, embedding_indices + index_mapping_indices, + lora_indices, + embedding_indices, ] - base_indices = torch.tensor(lora_indices, dtype=torch.long, device="cuda") if long_lora_context: assert long_lora_offsets is not None indices_list.append(long_lora_offsets) @@ -128,33 +128,39 @@ def convert_mapping( device="cuda", dtype=torch.long) embeddings_indices = torch.stack([ - indices[1] * extra_vocab_size, - indices[1] * (vocab_size + extra_vocab_size) + indices[2] * extra_vocab_size, + indices[2] * (vocab_size + extra_vocab_size), ]) embeddings_indices[embeddings_indices == -1] = max_loras - 1 - + base_indices = indices[1] sampler_indices = prompt_mapping_tensor sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 - sampler_indices_padded = ( - torch.arange( - 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + - (sampler_indices_padded * len(sampler_indices_padded))) + sampler_indices_padded = torch.arange( + 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + ( + sampler_indices_padded * len(sampler_indices_padded)) long_lora_indices = None long_lora_indices_len: Optional[int] = None if long_lora_context: - long_lora_indices = indices[2] + 
long_lora_indices = indices[3] long_lora_indices_len = long_lora_indices.shape[-1] # Contain length of indices tensors. Used to index into each tensor. indices_len = [ - base_indices.shape[-1], sampler_indices.shape[-1], - sampler_indices_padded.shape[-1], embeddings_indices.shape[-1] + base_indices.shape[-1], + sampler_indices.shape[-1], + sampler_indices_padded.shape[-1], + embeddings_indices.shape[-1], ] if long_lora_indices_len is not None: indices_len.append(long_lora_indices_len) - - return (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices, indices_len) + return ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + long_lora_indices, + indices_len, + ) def get_lora_id(): @@ -202,8 +208,8 @@ def clone(self, lora_model_id: int) -> "LoRAModel": @property def extra_vocab_size(self) -> int: - return max(lora.extra_vocab_size - for lora in self.loras.values()) if self.loras else 0 + return (max(lora.extra_vocab_size + for lora in self.loras.values()) if self.loras else 0) def get_lora(self, module_name: str) -> Optional[LoRALayerWeights]: """Get LoRA for a given module by name""" @@ -244,9 +250,14 @@ def from_lora_tensors( if pin_memory: lora_embeddings_tensor = ( lora_embeddings_tensor.pin_memory()) - loras[module_name] = LoRALayerWeights(module_name, rank, - lora_alpha, None, None, - lora_embeddings_tensor) + loras[module_name] = LoRALayerWeights( + module_name, + rank, + lora_alpha, + None, + None, + lora_embeddings_tensor, + ) if is_lora_a: loras[module_name].lora_a = tensor.to(device=device, dtype=dtype).t() @@ -257,9 +268,9 @@ def from_lora_tensors( loras[module_name].lora_b = tensor.to(device=device, dtype=dtype).t() assert embedding_padding_modules is not None - if any(name in module_name - for name in embedding_padding_modules - ) and target_embedding_padding is not None: + if (any(name in module_name + for name in embedding_padding_modules) + and target_embedding_padding is not None): lora_b = loras[module_name].lora_b assert target_embedding_padding >= lora_b.shape[1] addition = target_embedding_padding - lora_b.shape[1] @@ -288,7 +299,7 @@ def from_local_checkpoint( embedding_padding_modules: Optional[List[str]] = None, ) -> "LoRAModel": """Create a LoRAModel from a local checkpoint. - + Args: lora_dir: The local path that has lora data. expected_lora_modules: Name of modules that are expected to be @@ -411,17 +422,16 @@ def __init__( self.max_num_batched_tokens, dtype=torch.long, device="cuda") - self.long_lora_indices = torch.empty(self.max_num_batched_tokens, dtype=torch.long, device="cuda") # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora. self.scaling_factor_to_offset: Dict[float, int] = {} - # 4 is the number of indicies tensors defined above + # 5 is the number of indicies tensors. # base_indices, sampler_indices, sampler_indices_padded, - # embeddings_indices - self.indices_len: List[Optional[int]] = [None] * 4 + # embeddings_indices,prefilling or decoding + self.indices_len: List[Optional[int]] = [None] * 5 self.model: nn.Module = model if hasattr(self.model, "supported_lora_modules"): @@ -439,21 +449,7 @@ def __init__( # Dict instead of a Set for compatibility with LRUCache. 
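A worked toy example of the per-token conversion performed by convert_mapping above; the ids and slot assignments are invented. lora_index_to_id maps slot position to LoRA id, and tokens mapped to id 0 mean "no LoRA".

lora_index_to_id = [7, 9]                 # slot 0 holds LoRA id 7, slot 1 holds id 9
index_mapping = [7, 7, 9, 0]              # one LoRA id per input token, 0 = no LoRA
lora_indices = [lora_index_to_id.index(t) if t > 0 else -1 for t in index_mapping]
embedding_indices = [lora_index_to_id.index(t) if t > 0 else 0 for t in index_mapping]
assert lora_indices == [0, 0, 1, -1]      # becomes base_indices
assert embedding_indices == [0, 0, 1, 0]  # scaled into the two embeddings_indices rows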
self._active_loras: Dict[int, None] = {} self._last_mapping: Optional[LoRAMapping] = None - - # triton kernel mapping - self.seq_length_tensor = torch.empty(_MAX_BATCHS, - dtype=torch.long, - device="cuda") - self.b_seq_start_tensor = torch.zeros(_MAX_BATCHS, - dtype=torch.long, - device="cuda") - - # element contains batch_size, max_length, 0 or 1. Use 1 for the - # prefilling stage and 0 for the decoding stage.The reason for - # distinguishing between the prefilling and decoding stage is that - # we had implemented bgmv, it can be utilized during the decoding - # stage. - self.batch_mlen_stage_lst = [-1] * 3 + self._convert_flag = True self._create_lora_modules() self.model.lora_manager = self @@ -477,7 +473,9 @@ def activate_lora( return False first_free_slot = next( ((i, lora_id) for i, lora_id in enumerate(self.lora_index_to_id) - if lora_id is None), None) + if lora_id is None), + None, + ) if first_free_slot is None: raise ValueError("No free lora slots") index, _ = first_free_slot @@ -490,8 +488,12 @@ def activate_lora( module_lora = lora_model.get_lora(module_name) if module_lora: module_lora.optimize() - module.set_lora(index, module_lora.lora_a, module_lora.lora_b, - module_lora.embeddings_tensor) + module.set_lora( + index, + module_lora.lora_a, + module_lora.lora_b, + module_lora.embeddings_tensor, + ) else: module.reset_lora(index) return True @@ -518,7 +520,7 @@ def _set_long_lora_context(self, lora: LoRAModel): if lora.scaling_factor is None: return - if (lora.scaling_factor not in self.scaling_factor_to_offset): + if lora.scaling_factor not in self.scaling_factor_to_offset: raise ValueError(f"Long LoRA scaling factor {lora.scaling_factor}" " has not been initialized.") @@ -536,7 +538,11 @@ def add_lora(self, lora: LoRAModel) -> bool: logger.debug( "Adding lora. 
Model id: %d, " "int id: %d, " - "scaling factor: %s", lora.id, lora.id, lora.scaling_factor) + "scaling factor: %s", + lora.id, + lora.id, + lora.scaling_factor, + ) if lora.id not in self._registered_loras: if len(self._registered_loras) >= self.capacity: raise RuntimeError("No free LoRA slots.") @@ -554,12 +560,21 @@ def remove_lora(self, lora_id: int) -> bool: # TODO see if this can be vectorized def _set_lora_mapping(self, mapping: LoRAMapping) -> None: - (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_offsets_tensor, - indices_len) = convert_mapping(mapping, self.lora_index_to_id, - self.lora_slots + 1, self.vocab_size, - self.lora_config.lora_extra_vocab_size, - self.long_lora_context) + ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + long_lora_offsets_tensor, + indices_len, + ) = convert_mapping( + mapping, + self.lora_index_to_id, + self.lora_slots + 1, + self.vocab_size, + self.lora_config.lora_extra_vocab_size, + self.long_lora_context, + ) self.base_indices[:base_indices.shape[0]].copy_(base_indices) self.sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) self.sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( @@ -573,25 +588,11 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: else: self.long_lora_indices.zero_() # Maintain the reference - self.indices_len[:] = indices_len - - # Mapping for sgmv kernel - if mapping.seq_lens and mapping.batch_mapping: - batchs = len(mapping.seq_lens) - seq_length_tensor = torch.tensor(mapping.seq_lens, - dtype=torch.long, - device="cuda") - self.seq_length_tensor[:batchs].copy_(seq_length_tensor) - temp_tensor = torch.cumsum(seq_length_tensor, - dim=0, - dtype=seq_length_tensor.dtype) - self.b_seq_start_tensor[1:temp_tensor.size(0) + - 1].copy_(temp_tensor) - - self.batch_mlen_stage_lst[:] = [ - batchs, - max(mapping.seq_lens), 1 if mapping.is_prefilling else 0 - ] + self.indices_len[:] = indices_len + [int(mapping.is_prefilling)] + # + if mapping.is_prefilling: + punica.reset_params_cache() + punica._compute_params(self.base_indices[:base_indices.shape[0]]) def set_lora_mapping(self, lora_mapping: LoRAMapping) -> None: if self._last_mapping != lora_mapping: @@ -619,52 +620,66 @@ def _create_lora_modules(self): parts = module_name.split(".")[-1] packed_moduled_lst = self.packed_modules_mapping.get(parts, []) new_module = replace_submodule( - self.model, module_name, - from_layer(module, self.lora_slots, self.lora_config, - packed_moduled_lst, self.model.config)) + self.model, + module_name, + from_layer( + module, + self.lora_slots, + self.lora_config, + packed_moduled_lst, + self.model.config, + ), + ) # LinearScalingRotaryEmbeddingWithLora is used to handle # long context lora. Register relevant metadata. 
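A minimal illustration of how indices_len is laid out after _set_lora_mapping above appends the prefill flag, and of why the in-place slice assignment matters: every wrapped layer keeps a reference to the same list via set_mapping, so it observes the update without any further call. The concrete lengths are invented.

indices_len = [None] * 5                     # shared with every LoRA layer via set_mapping
layer_view = indices_len                     # layers store the same list object
new_lengths = [10, 4, 4, 10]                 # base, sampler, sampler_padded, embeddings
# (a further entry for long_lora_indices is present only with long-context LoRA)
is_prefilling = True
indices_len[:] = new_lengths + [int(is_prefilling)]
assert layer_view == [10, 4, 4, 10, 1]       # last element: 1 = prefill, 0 = decode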
if isinstance(new_module, LinearScalingRotaryEmbeddingWithLora): self.long_lora_context = LongContextLoRAContext( new_module.scaling_factors, new_module.rotary_dim) - self.scaling_factor_to_offset = \ - new_module.scaling_factor_to_offset + self.scaling_factor_to_offset = ( + new_module.scaling_factor_to_offset) # (yard1): TODO make this more robust if "lm_head" in module_name: logits_processor_module = self.model.get_submodule( "logits_processor") new_module = replace_submodule( - self.model, "logits_processor", - from_layer_logits_processor(logits_processor_module, - module, self.lora_slots, - self.lora_config, - self.model.config)) + self.model, + "logits_processor", + from_layer_logits_processor( + logits_processor_module, + module, + self.lora_slots, + self.lora_config, + self.model.config, + ), + ) self.register_module(module_name, new_module) self._register_packed_modules(module_name) - new_module.set_mapping(self.base_indices, self.sampler_indices, - self.sampler_indices_padded, - self.embeddings_indices, - self.long_lora_indices, self.indices_len, - self.seq_length_tensor, - self.b_seq_start_tensor, - self.batch_mlen_stage_lst) + new_module.set_mapping( + self.base_indices, + self.sampler_indices, + self.sampler_indices_padded, + self.embeddings_indices, + self.long_lora_indices, + self.indices_len, + ) def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): assert isinstance(module, BaseLayerWithLoRA) self.modules[module_name] = module def create_dummy_lora( - self, - lora_id: int, - rank: int, - scaling_factor: Optional[float], - embedding_modules: Optional[Dict[str, str]] = None) -> LoRAModel: + self, + lora_id: int, + rank: int, + scaling_factor: Optional[float], + embedding_modules: Optional[Dict[str, str]] = None, + ) -> LoRAModel: """Create zero-initialized LoRAModel for warmup.""" model = LoRAModel(lora_id, rank, {}, scaling_factor) for module_name, module in self.model.named_modules(): - if not self._match_target_modules(module_name) or not isinstance( - module, BaseLayerWithLoRA) or isinstance( - module, LinearScalingRotaryEmbeddingWithLora): + if (not self._match_target_modules(module_name) + or not isinstance(module, BaseLayerWithLoRA) or isinstance( + module, LinearScalingRotaryEmbeddingWithLora)): continue parts = module_name.split(".") if module_name not in self.packed_modules: @@ -674,9 +689,9 @@ def create_dummy_lora( self.lora_config.lora_extra_vocab_size if hasattr(module.base_layer, "org_vocab_size") else module.base_layer.weight.shape[1]) - output_dim = module.base_layer.embedding_dim if hasattr( - module.base_layer, - "embedding_dim") else module.base_layer.weight.shape[0] + output_dim = (module.base_layer.embedding_dim if hasattr( + module.base_layer, "embedding_dim") else + module.base_layer.weight.shape[0]) embeddings_tensor_dim = (module.base_layer.embedding_dim if hasattr(module.base_layer, "embedding_dim") else @@ -688,7 +703,8 @@ def create_dummy_lora( rank, module.lora_a_stacked.dtype, "cpu", - embeddings_tensor_dim=embeddings_tensor_dim) + embeddings_tensor_dim=embeddings_tensor_dim, + ) else: lora = LoRALayerWeights.create_dummy_lora_weights( module_name, @@ -722,7 +738,8 @@ def _match_target_modules(self, module_name: str): return any( re.match( r".*\.{target_module}$".format(target_module=target_module), - module_name) or target_module == module_name + module_name, + ) or target_module == module_name for target_module in self.supported_lora_modules) def _register_packed_modules(self, module_full_name: str) -> None: @@ -797,7 
+814,11 @@ def add_lora(self, lora: LoRAModel) -> bool: logger.debug( "Adding lora. Model id: %d, " "int id: %d, " - "scaling factor: %s", lora.id, lora.id, lora.scaling_factor) + "scaling factor: %s", + lora.id, + lora.id, + lora.scaling_factor, + ) if lora.id not in self._registered_loras: self._add_lora(lora) was_added = True @@ -811,8 +832,8 @@ def activate_lora( self, lora_id: int, ) -> bool: - if lora_id not in self._active_loras and len( - self._active_loras) >= self.lora_slots: + if (lora_id not in self._active_loras + and len(self._active_loras) >= self.lora_slots): self._active_loras.remove_oldest() result = super().activate_lora(lora_id) # We always touch to update the LRU cache order @@ -827,13 +848,14 @@ def remove_oldest_lora(self) -> bool: def create_lora_manager( - model: nn.Module, - max_num_seqs: int, - max_num_batched_tokens: int, - vocab_size: int, - lora_config: LoRAConfig, - lora_manager_cls: Type[LoRAModelManager] = LoRAModelManager, - **kwargs) -> LoRAModelManager: + model: nn.Module, + max_num_seqs: int, + max_num_batched_tokens: int, + vocab_size: int, + lora_config: LoRAConfig, + lora_manager_cls: Type[LoRAModelManager] = LoRAModelManager, + **kwargs, +) -> LoRAModelManager: """Create a LoRA adapter for a given model.""" if not hasattr(model, "supported_lora_modules"): raise ValueError(f"Model {type(model)} is not supported for LoRA.") @@ -843,5 +865,6 @@ def create_lora_manager( max_num_batched_tokens=max_num_batched_tokens, vocab_size=vocab_size, lora_config=lora_config, - **kwargs) + **kwargs, + ) return lora_manager diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index ba387fc2010f..7366edf81491 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -1,9 +1,7 @@ # Based on code from https://github.com/punica-ai/punica -from typing import Optional - +from typing import Optional, Dict, Tuple import torch - from vllm.lora.ops.bgmv_expand import bgmv_expand from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink @@ -15,12 +13,53 @@ def _raise_import_error(e): if torch.cuda.get_device_capability() < (8, 0): raise ImportError( - "punica LoRA kernels require compute capability >= 8.0") from e + "punica LoRA kernels require compute capability >= 8.0" + ) from e else: raise ImportError( "punica LoRA kernels could not be imported. If you built vLLM " "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " - "was set.") from e + "was set." 
+ ) from e + + +_PARAMS_CACHE: Dict[int, Tuple] = {} + + +def _compute_params(token_lora_tensor: torch.Tensor): + pointer = token_lora_tensor.data_ptr() + if pointer not in _PARAMS_CACHE: + lora_indices_tensor, seq_length_tensor = torch.unique_consecutive( + token_lora_tensor, return_counts=True + ) + cum_result = torch.cumsum(seq_length_tensor, dim=0) + b_seq_start_tensor = torch.zeros_like(seq_length_tensor) + b_seq_start_tensor[1:].copy_(cum_result[:-1]) + max_length = seq_length_tensor.max().item() + batch_size = lora_indices_tensor.size(0) + _PARAMS_CACHE[pointer] = ( + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + ) + return _PARAMS_CACHE[pointer] + + +def reset_params_cache(): + """At the beginning of the prefilling stage, we need clear the + cache explicitly + """ + _PARAMS_CACHE.clear() + + +def _get_prefilling_params( + token_lora_tensor: torch.Tensor, cache_clear: bool = False +): + if cache_clear: + reset_params_cache() + return _compute_params(token_lora_tensor) def bgmv( @@ -147,12 +186,13 @@ def add_lora( # We set the buffer to be float32 by default to avoid # numerical inaccuracies that would otherwise happen # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) + buffer = torch.zeros( + (x.size(0), r), dtype=torch.float32, device=x.device + ) punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) - punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, - scale) + punica_kernels.dispatch_bgmv( + y, buffer, wb_t_all, indicies, layer_idx, scale + ) def add_lora_slice( @@ -200,12 +240,11 @@ def add_lora_slice( r = wb_t_all.size(-1) if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) + # We set the buffer to be float32 by default ,refer to: + # https://github.com/triton-lang/triton/issues/1387 + buffer = torch.zeros( + (x.size(0), r), dtype=torch.float32, device=x.device + ) punica_kernels.dispatch_bgmv_low_level( buffer, x, @@ -230,269 +269,175 @@ def add_lora_slice( ) -def add_lora_triton( +def add_shrink_triton( y: torch.Tensor, x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - seq_length_tensor: torch.Tensor, + w_t_all: torch.Tensor, lora_indices_tensor: torch.Tensor, - batch_size: int, - max_length: int, layer_idx: int, scale: float, is_prefilling: bool, - *, - buffer: Optional[torch.Tensor] = None, + cache_clear: bool = False, ): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[lora_index_tensor[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[lora_index_tensor[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - Args: - y (torch.Tensor): (batch_size, output_dim).Will be changed in-place. - x (torch.Tensor): (batch_size, hidden_dim) - wa_t_all (torch.Tensor): (num_loras, lora_rank, hidden_dim) - wb_t_all (torch.Tensor): (num_loras, output_dim, lora_rank) - b_seq_start_tensor (torch.Tensor): (batch_size,). The cumulative - sequence lengths of the sequences in the batch, used to index - into sequence. E.g.,if the sequence length is [4, 6], it is - [0, 4]. Used only during the prefilling stage. - seq_length_tensor (torch.Tensor): batch_size,). record the sequence - length of the sequences in the batch. Used only during the - prefilling stage. 
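A worked example (toy values, CPU tensors) of the decomposition that _compute_params above derives from the per-token LoRA index tensor during prefill. Note that unique_consecutive yields one segment per run of identical indices, so adjacent sequences that happen to share an adapter are fused into a single segment.

import torch

token_lora = torch.tensor([0, 0, 0, 1, 1, -1])               # 3 sequences: lengths 3, 2, 1
lora_ids, seq_lens = torch.unique_consecutive(token_lora, return_counts=True)
b_seq_start = torch.zeros_like(seq_lens)
b_seq_start[1:] = torch.cumsum(seq_lens, dim=0)[:-1]
assert lora_ids.tolist() == [0, 1, -1]                        # LoRA slot per segment
assert seq_lens.tolist() == [3, 2, 1]                         # tokens per segment
assert b_seq_start.tolist() == [0, 3, 5]                      # segment start offsets
assert (lora_ids.size(0), int(seq_lens.max())) == (3, 3)      # batch_size, max_length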
- lora_index_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch - batch_size (int): batch size. Used only during the prefilling stage. - max_length (int): maximum seq length in the batch.Used only during the - prefilling stage. - layer_idx (int): Layer index of LoRA weights. - scale (float): Scaling factor. - is_prefilling (bool): True indicates the prefilling stage, while False - indicates the decoding stage." - buffer (Optional[torch.Tensor], optional): (batch_size,rank) - """ - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default ,refer to: - # https://github.com/triton-lang/triton/issues/1387 - - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) if is_prefilling: - _lora_sgmv( - y, + ( + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + ) = _get_prefilling_params(lora_indices_tensor, cache_clear) + sgmv_shrink( x, - wa_t_all, - wb_t_all, + w_t_all, + y, b_seq_start_tensor, seq_length_tensor, - lora_indices_tensor, + last_lora_indices_tensor, batch_size, max_length, - layer_idx, scale, - buffer=buffer, ) else: - _lora_bgmv( - y, - x, - wa_t_all, - wb_t_all, - lora_indices_tensor, - layer_idx, - scale, - buffer=buffer, - ) + bgmv_shrink(x, w_t_all, y, lora_indices_tensor, scale) -def _lora_sgmv( +def add_expand_triton( y: torch.Tensor, x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - seq_length_tensor: torch.Tensor, - lora_indices_tensor: torch.Tensor, - batch_size: int, - max_length: int, - layer_idx: int, - scale: float, - buffer: torch.Tensor, -): - sgmv_shrink( - x, - wa_t_all, - buffer, - b_seq_start_tensor, - seq_length_tensor, - lora_indices_tensor, - batch_size, - max_length, - scale, - ) - sgmv_expand( - buffer, - wb_t_all, - y, - b_seq_start_tensor, - seq_length_tensor, - lora_indices_tensor, - batch_size, - max_length, - add_inputs=True, - ) - - -def _lora_bgmv( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, + w_t_all: torch.Tensor, lora_indices_tensor: torch.Tensor, layer_idx: int, - scale: float, - buffer: torch.Tensor, + is_prefilling: bool, + add_input: bool = True, + cache_clear: bool = False, ): - bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, scale) - bgmv_expand(buffer, wb_t_all, y, lora_indices_tensor, add_inputs=True) + if is_prefilling: + ( + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + ) = _get_prefilling_params(lora_indices_tensor, cache_clear) + sgmv_expand( + x, + w_t_all, + y, + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + add_input, + ) + else: + bgmv_expand(x, w_t_all, y, lora_indices_tensor, add_inputs=add_input) -def add_lora_triton_slice( +def add_expand_slice_triton( y: torch.Tensor, x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - seq_length_tensor: torch.Tensor, + w_t_all: torch.Tensor, lora_indices_tensor: torch.Tensor, - batch_size: int, - max_length: int, layer_idx: int, - scale: float, + is_prefilling: bool, y_offset: int, y_slice_size: int, - is_prefilling: bool, - *, - buffer: Optional[torch.Tensor] = None, + add_input: bool = True, + cache_clear: bool = False, ): - """ - Same as `add_lora_triton` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. 
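A tiny plain-PyTorch check of the property the prefill/decode dispatch above relies on: one GEMM per consecutive segment (the sgmv view used for prefill) matches one matmul per token (the bgmv view used for decode) whenever consecutive tokens share an adapter. Shapes and values are toy assumptions.

import torch

torch.manual_seed(0)
hidden, rank, max_loras = 16, 4, 2
x = torch.randn(5, hidden)
a = torch.randn(max_loras, rank, hidden)                      # per-slot LoRA-A
token_lora = torch.tensor([0, 0, 0, 1, 1])

per_token = torch.stack(
    [x[i] @ a[slot].t() for i, slot in enumerate(token_lora.tolist())])

ids, counts = torch.unique_consecutive(token_lora, return_counts=True)
starts = torch.zeros_like(counts)
starts[1:] = torch.cumsum(counts, dim=0)[:-1]
per_segment = torch.cat([
    x[s:s + n] @ a[slot].t()
    for slot, s, n in zip(ids.tolist(), starts.tolist(), counts.tolist())
])
assert torch.allclose(per_token, per_segment)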
- """ - # try: - # import vllm._punica_C as punica_kernels - # except ImportError as e: - # _raise_import_error(e) - - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) if is_prefilling: - _lora_sgmv_nslice( - y, + ( + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + ) = _get_prefilling_params(lora_indices_tensor, cache_clear) + sgmv_expand_slice( x, - wa_t_all, - wb_t_all, + w_t_all, + y, b_seq_start_tensor, seq_length_tensor, - lora_indices_tensor, + last_lora_indices_tensor, batch_size, max_length, - layer_idx, - scale, y_offset, y_slice_size, - buffer, + add_input, ) else: - _lora_bgmv_nslice( - y, + bgmv_expand_slice( x, - wa_t_all, - wb_t_all, + w_t_all, + y, lora_indices_tensor, - layer_idx, - scale, y_offset, y_slice_size, - buffer, + add_inputs=add_input, ) -def _lora_sgmv_nslice( +def add_lora_triton( y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, wb_t_all: torch.Tensor, - b_seq_start_tensor: torch.Tensor, - seq_length_tensor: torch.Tensor, lora_indices_tensor: torch.Tensor, - batch_size: int, - max_length: int, layer_idx: int, scale: float, - y_offset: int, - y_slice_size: int, - buffer, + is_prefilling: bool, + y_offset: Optional[int] = None, + y_slice_size: Optional[int] = None, + *, + buffer: Optional[torch.Tensor] = None, + cache_clear: bool = False, ): - sgmv_shrink( + """ + Same as `add_lora_triton` but you can operate on slices of y. + Pass whole y, define y_offset and y_slice_size. + """ + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default ,refer to: + # https://github.com/triton-lang/triton/issues/1387 + buffer = torch.zeros( + (x.size(0), r), dtype=torch.float32, device=x.device + ) + + add_shrink_triton( + buffer, x, wa_t_all, - buffer, - b_seq_start_tensor, - seq_length_tensor, lora_indices_tensor, - batch_size, - max_length, + 0, scale, + is_prefilling, + cache_clear=cache_clear, ) - sgmv_expand_slice( - buffer, - wb_t_all, - y, - b_seq_start_tensor, - seq_length_tensor, - lora_indices_tensor, - batch_size, - max_length, - y_offset, - y_slice_size, - add_inputs=True, - ) - - -def _lora_bgmv_nslice( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, - buffer, -): - bgmv_shrink(x, wa_t_all, buffer, lora_indices_tensor, scale) - bgmv_expand_slice( - buffer, - wb_t_all, - y, - lora_indices_tensor, - y_offset, - y_slice_size, - add_inputs=True, - ) + if y_offset is None and y_slice_size is None: + add_expand_triton( + y, + buffer, + wb_t_all, + lora_indices_tensor, + 0, + is_prefilling, + add_input=True, + cache_clear=cache_clear, + ) + else: + add_expand_slice_triton( + y, + buffer, + wb_t_all, + lora_indices_tensor, + 0, + is_prefilling, + y_offset, + y_slice_size, + add_input=True, + cache_clear=cache_clear, + ) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 476e9ba3bb46..3a5bc6c78515 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,7 +1,5 @@ -import gc import time import warnings -from collections import defaultdict from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union import numpy as np @@ -13,17 +11,16 @@ ModelConfig, 
ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict -from vllm.distributed.parallel_state import graph_capture +from vllm.distributed.communication_op import graph_capture from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sampling_params import SamplingParams -from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata +from vllm.sequence import (MultiModalData, SamplerOutput, SequenceData, + SequenceGroupMetadata) from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip, is_pin_memory_available, make_tensor_with_pad) @@ -37,7 +34,6 @@ _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33) ] -_NUM_WARMUP_ITERS = 2 class ModelInput(NamedTuple): @@ -48,7 +44,7 @@ class ModelInput(NamedTuple): query_lens: List[int] lora_mapping: Optional[LoRAMapping] lora_requests: Set[LoRARequest] - multi_modal_kwargs: Dict[str, torch.Tensor] + multi_modal_input: Optional[torch.Tensor] slot_mapping: torch.Tensor num_prefill_tokens: int num_decode_tokens: int @@ -64,7 +60,7 @@ def empty(cls, device): query_lens=[], lora_mapping=None, lora_requests=set(), - multi_modal_kwargs={}, + multi_modal_input=None, slot_mapping=torch.empty(0, device=device), num_prefill_tokens=0, num_decode_tokens=0, @@ -126,16 +122,6 @@ def __init__( self.block_size, ) - # Create processor for multi-modal data - if self.vision_language_config is not None: - self.multi_modal_input_processor = MULTIMODAL_REGISTRY \ - .create_input_processor( - self.model_config, - self.vision_language_config, - ) - else: - self.multi_modal_input_processor = None - # Lazy initialization self.model: nn.Module # Set after load_model # Set if the backend is flashinfer. @@ -223,16 +209,6 @@ def save_sharded_state( max_size=max_size, ) - def save_tensorized_model( - self, - tensorizer_config: TensorizerConfig, - ) -> None: - from vllm.model_executor.model_loader.loader import TensorizerLoader - TensorizerLoader.save_model( - self.model, - tensorizer_config=tensorizer_config, - ) - def get_max_block_per_batch(self) -> int: block_size = self.block_size return (self.max_seq_len_to_capture + block_size - 1) // block_size @@ -266,8 +242,7 @@ def _prepare_model_input( context_lens: List[int] = [] query_lens: List[int] = [] block_tables: List[List[int]] = [] - multi_modal_kwargs_list: Dict[str, - List[torch.Tensor]] = defaultdict(list) + multi_modal_input_list: List[torch.Tensor] = [] decode_only = True num_prefills = 0 num_prefill_tokens = 0 @@ -294,12 +269,6 @@ def _prepare_model_input( if len(seq_group_metadata_list) == 0: return ModelInput.empty(self.device) - if self.sliding_window is not None: - sliding_window_blocks = (self.sliding_window + self.block_size - - 1) // self.block_size - block_aligned_sliding_window = \ - sliding_window_blocks * self.block_size - for seq_group_metadata in seq_group_metadata_list: seq_ids = list(seq_group_metadata.seq_data.keys()) is_prompt = seq_group_metadata.is_prompt @@ -340,30 +309,6 @@ def _prepare_model_input( and self.sliding_window is None and is_prompt) - # These are seq_len/context_len capped to the sliding window. 
- # They are passed to decode kernel. - # We still need original seq_len/context_len to compute slot - # mapping (and input position) below. - curr_sliding_window_blocks = None - sliding_seq_len = seq_len - sliding_context_len = context_len - - # TODO(sang): This is a hack to make sliding window work with - # paged attn. We can remove it if we make paged attn kernel - # to properly handle slinding window attn. - if (self.sliding_window is not None and not is_prompt): - curr_sliding_window_blocks = sliding_window_blocks - if self.scheduler_config.use_v2_block_manager: - # number of elements in last block - suff_len = seq_len % self.block_size - sliding_seq_len = min( - seq_len, block_aligned_sliding_window + suff_len) - if suff_len > 0: - curr_sliding_window_blocks += 1 - else: - sliding_seq_len = min(seq_len, self.sliding_window) - sliding_context_len = sliding_seq_len - 1 - # TODO(sang): Combine chunked prefill and prefix caching by # only allowing multiple of block_size chunk size. # NOTE: This only works for oooooooxxx style attention. @@ -371,13 +316,6 @@ def _prepare_model_input( assert computed_block_nums is not None context_len = len(computed_block_nums) * self.block_size tokens = tokens[context_len:] - - # need to think what to set it to when we have both sliding - # window and prefix caching... - assert self.sliding_window is None, \ - "Prefix caching is not supported with sliding window" - sliding_context_len = context_len - if self.attn_backend.get_name() == "flash-attn": # NOTE(woosuk): For flash-attn, the block table should # include the entries for the incoming prefill tokens. @@ -391,9 +329,14 @@ def _prepare_model_input( if seq_group_metadata.block_tables is not None: # chunked prefill or decode block_table = seq_group_metadata.block_tables[seq_id] - if curr_sliding_window_blocks is not None: - block_table = block_table[ - -curr_sliding_window_blocks:] + if self.sliding_window is not None: + # chunked prefill doesn't support sliding window. + assert (not self.scheduler_config. + chunked_prefill_enabled) + sliding_window_blocks = (self.sliding_window // + self.block_size) + block_table = block_table[-sliding_window_blocks:] + if self.attn_backend.get_name() == "flashinfer": paged_kv_indices.extend(block_table) paged_kv_indptr.append(paged_kv_indptr[-1] + @@ -411,9 +354,16 @@ def _prepare_model_input( block_table = [] block_tables.append(block_table) - seq_lens.append(sliding_seq_len) - context_lens.append(sliding_context_len) - query_len = sliding_seq_len - sliding_context_len + # TODO(sang): This is a hack to make sliding window work with + # paged attn. We can remove it if we make paged attn kernel + # to properly handle slinding window attn. 
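A toy illustration of the decode-time sliding-window cap that the hunk below restores; the window size and sequence length are invented. The sequence length is clamped to the window, and decode always has a single query token.

seq_len, sliding_window = 4096, 1024
seq_len = min(seq_len, sliding_window)
context_len = seq_len - 1
query_len = seq_len - context_len
assert (seq_len, context_len, query_len) == (1024, 1023, 1)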
+ if (self.sliding_window is not None and not is_prompt): + seq_len = min(seq_len, self.sliding_window) + context_len = seq_len - 1 + + seq_lens.append(seq_len) + context_lens.append(context_len) + query_len = seq_len - context_len query_lens.append(query_len) input_tokens.extend(tokens) input_positions.extend(list(range(context_len, seq_len))) @@ -430,7 +380,7 @@ def _prepare_model_input( "seq_len: {}, context_len: {}, query_len: {}".format( seq_len, context_len, query_len)) num_decode_tokens += query_len - decode_seq_lens.append(sliding_seq_len) + decode_seq_lens.append(seq_len) if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) @@ -438,21 +388,14 @@ def _prepare_model_input( lora_index_mapping += [lora_id] * query_len lora_prompt_mapping.extend( [lora_id] * - (query_len if seq_group_metadata.sampling_params + (seq_len - + context_len if seq_group_metadata.sampling_params and seq_group_metadata.sampling_params.prompt_logprobs - is not None else 1)) + else 1)) - mm_data = seq_group_metadata.multi_modal_data - if mm_data is not None: - # Process multi-modal data - if self.multi_modal_input_processor is None: - raise ValueError( - "Multi-modal inputs are only supported by " - "vision language models.") - - mm_kwargs = self.multi_modal_input_processor(mm_data) - for k, v in mm_kwargs.items(): - multi_modal_kwargs_list[k].append(v) + if seq_group_metadata.multi_modal_data: + multi_modal_input_list.append( + seq_group_metadata.multi_modal_data.data) if _is_block_tables_empty(seq_group_metadata.block_tables): # During memory profiling, the block tables are not @@ -474,10 +417,9 @@ def _prepare_model_input( start_idx = 0 if self.sliding_window is not None: if is_prompt: - assert self.scheduler_config.use_v2_block_manager \ - or context_len == 0, ( + assert context_len == 0, ( "Prefix caching is currently not supported with " - "sliding window attention in V1 block manager") + "sliding window attention") # It is an optimization. When it is decoding, it is always # 0. When prefill, we use it to not write slots to kv cache # to save memory. 
@@ -538,6 +480,29 @@ def _prepare_model_input( ) assert max_query_len > 0, ("query_lens: {}".format(query_lens)) + context_lens_tensor = torch.tensor(context_lens, + dtype=torch.int, + device=self.device) + + if multi_modal_input_list: + assert self.vision_language_config, ( + "Multi-modal inputs are only supported by " + "vision language models.") + multi_modal_input = torch.cat(multi_modal_input_list, + dim=0).to(self.device) + else: + multi_modal_input = None + + seq_lens_tensor = torch.tensor(seq_lens, + dtype=torch.int, + device=self.device) + query_lens_tensor = torch.tensor(query_lens, + dtype=torch.long, + device=self.device) + query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=self.device) + seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int, device=self.device) @@ -545,6 +510,11 @@ def _prepare_model_input( dtype=torch.int32, device=self.device) + torch.cumsum(query_lens_tensor, + dim=0, + dtype=query_start_loc.dtype, + out=query_start_loc[1:]) + torch.cumsum(seq_lens_tensor, dim=0, dtype=seq_start_loc.dtype, @@ -597,21 +567,6 @@ def _prepare_model_input( seq_start_loc=seq_start_loc, data_type=kv_cache_dtype) else: - context_lens_tensor = torch.tensor(context_lens, - dtype=torch.int, - device=self.device) - query_lens_tensor = torch.tensor(query_lens, - dtype=torch.long, - device=self.device) - query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=self.device) - - torch.cumsum(query_lens_tensor, - dim=0, - dtype=query_start_loc.dtype, - out=query_start_loc[1:]) - attn_metadata = self.attn_backend.make_metadata( num_prefills=num_prefills, slot_mapping=slot_mapping_tensor, @@ -630,18 +585,11 @@ def _prepare_model_input( ) if self.lora_config: - lora_mapping = LoRAMapping( - lora_index_mapping, - lora_prompt_mapping, - ) + lora_mapping = LoRAMapping(lora_index_mapping, lora_prompt_mapping, + bool(attn_metadata.prefill_metadata)) else: lora_mapping = None - multi_modal_kwargs = { - k: torch.cat(v, dim=0).to(self.device) - for k, v in multi_modal_kwargs_list.items() - } - return ModelInput( input_tokens=input_tokens_tensor, input_positions=input_positions_tensor, @@ -650,7 +598,7 @@ def _prepare_model_input( query_lens=query_lens, lora_mapping=lora_mapping, lora_requests=lora_requests, - multi_modal_kwargs=multi_modal_kwargs, + multi_modal_input=multi_modal_input, slot_mapping=slot_mapping_tensor, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, @@ -661,7 +609,7 @@ def prepare_input_tensors( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, - Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]: + Set[LoRARequest], LoRAMapping, torch.Tensor]: if self.is_driver_worker: assert seq_group_metadata_list is not None # Prepare input tensors. 
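For orientation, a sketch of the extended LoRAMapping constructed above; the third positional argument is the value of bool(attn_metadata.prefill_metadata) and is read back as mapping.is_prefilling in vllm/lora/models.py. The id values here are invented, and the construction itself is left as a comment since the mapping class definition is not part of this hunk.

lora_index_mapping = [1, 1, 1, 2, 2]      # one LoRA id per input token
lora_prompt_mapping = [1, 2]              # one LoRA id per sequence needing logits
is_prefilling = True                      # bool(attn_metadata.prefill_metadata)
# mapping = LoRAMapping(lora_index_mapping, lora_prompt_mapping, is_prefilling)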
@@ -673,7 +621,7 @@ def prepare_input_tensors( query_lens, lora_mapping, lora_requests, - multi_modal_kwargs, + multi_modal_input, slot_mapping, num_prefill_tokens, num_decode_tokens, @@ -690,7 +638,7 @@ def prepare_input_tensors( sampling_metadata.selected_token_indices, "lora_requests": lora_requests, "lora_mapping": lora_mapping, - "multi_modal_kwargs": multi_modal_kwargs, + "multi_modal_input": multi_modal_input, "num_prefill_tokens": num_prefill_tokens, "num_decode_tokens": num_decode_tokens, "slot_mapping": slot_mapping, @@ -707,7 +655,7 @@ def prepare_input_tensors( "selected_token_indices") lora_mapping = metadata_dict.pop("lora_mapping") lora_requests = metadata_dict.pop("lora_requests") - multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs") + multi_modal_input = metadata_dict.pop("multi_modal_input") if metadata_dict: attn_metadata = self.attn_backend.make_metadata( **metadata_dict) @@ -722,7 +670,7 @@ def prepare_input_tensors( return (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, - multi_modal_kwargs) + multi_modal_input) @torch.inference_mode() def execute_model( @@ -731,7 +679,7 @@ def execute_model( kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: (input_tokens, input_positions, attn_metadata, sampling_metadata, - lora_requests, lora_mapping, multi_modal_kwargs + lora_requests, lora_mapping, multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) if self.lora_config: @@ -745,14 +693,15 @@ def execute_model( model_executable = self.graph_runners[graph_batch_size] else: model_executable = self.model - - hidden_states = model_executable( - input_ids=input_tokens, - positions=input_positions, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - **multi_modal_kwargs, - ) + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": attn_metadata, + } + if self.vision_language_config: + execute_model_kwargs.update({"image_input": multi_modal_input}) + hidden_states = model_executable(**execute_model_kwargs) # Compute the logits. logits = self.model.compute_logits(hidden_states, sampling_metadata) @@ -808,24 +757,16 @@ def profile_run(self) -> None: # To exercise the worst scenario for GPU memory consumption, # the number of seqs (batch_size) is chosen to maximize the number # of images processed. 
- model_config = self.model_config - vlm_config = self.vision_language_config - - if vlm_config: + if self.vision_language_config: max_num_seqs = min( max_num_seqs, - int(max_num_batched_tokens / vlm_config.image_feature_size)) + int(max_num_batched_tokens / + self.vision_language_config.image_feature_size)) for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) - - if vlm_config is None: - seq_data = SequenceData([0] * seq_len) - dummy_multi_modal_data = None - else: - seq_data, dummy_multi_modal_data = MULTIMODAL_REGISTRY \ - .dummy_data_for_profiling(seq_len, model_config, vlm_config) - + seq_data, fake_multi_modal_input = _prepare_fake_inputs( + seq_len, self.vision_language_config) seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, @@ -834,7 +775,7 @@ def profile_run(self) -> None: block_tables=None, lora_request=dummy_lora_requests_per_seq[group_id] if dummy_lora_requests_per_seq else None, - multi_modal_data=dummy_multi_modal_data, + multi_modal_data=fake_multi_modal_input, ) seqs.append(seq) @@ -906,10 +847,6 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: seq_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda() block_tables = torch.from_numpy(self.graph_block_tables).cuda() - # Prepare buffer for outputs. These will be reused for all batch sizes. - # It will be filled after the first graph capture. - hidden_states: Optional[torch.Tensor] = None - graph_batch_size = _get_graph_batch_size( self.scheduler_config.max_num_seqs) batch_size_capture_list = [ @@ -939,18 +876,14 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: ) if self.lora_config: - lora_mapping = LoRAMapping( - [0] * batch_size, - [0] * batch_size, - ) + lora_mapping = LoRAMapping([0] * batch_size, + [0] * batch_size, False) self.set_active_loras(set(), lora_mapping) graph_runner = CUDAGraphRunner(self.model) - hidden_states = graph_runner.capture( + graph_runner.capture( input_tokens[:batch_size], input_positions[:batch_size], - hidden_states[:batch_size] - if hidden_states is not None else None, kv_caches, attn_metadata, memory_pool=self.graph_memory_pool, @@ -987,46 +920,35 @@ def capture( self, input_ids: torch.Tensor, positions: torch.Tensor, - hidden_states: Optional[torch.Tensor], kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, memory_pool: Optional[Tuple[int, int]], stream: torch.cuda.Stream, **kwargs, - ) -> torch.Tensor: + ) -> None: assert self._graph is None - # Run the model a few times without capturing the graph. + # Run the model once without capturing the graph. # This is to make sure that the captured graph does not include the # kernel launches for initial benchmarking (e.g., Triton autotune). - # Note one iteration is not enough for torch.jit.script - for _ in range(_NUM_WARMUP_ITERS): - self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - **kwargs, - ) + self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + **kwargs, + ) torch.cuda.synchronize() # Capture the graph. 
self._graph = torch.cuda.CUDAGraph() with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream): - output_hidden_states = self.model( + hidden_states = self.model( input_ids, positions, kv_caches, attn_metadata, **kwargs, ) - if hidden_states is not None: - hidden_states.copy_(output_hidden_states) - else: - hidden_states = output_hidden_states - del output_hidden_states - # make sure `output_hidden_states` is deleted - # in the graph's memory pool - gc.collect() torch.cuda.synchronize() # Save the input and output buffers. @@ -1039,7 +961,7 @@ def capture( "block_tables": attn_metadata.decode_metadata.block_tables, } self.output_buffers = {"hidden_states": hidden_states} - return hidden_states + return def forward( self, @@ -1086,6 +1008,24 @@ def _get_graph_batch_size(batch_size: int) -> int: _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) +def _prepare_fake_inputs( + seq_len: int, vision_language_config: Optional[VisionLanguageConfig]): + """Prepare fake inputs for profile run.""" + if vision_language_config: + prompt_tokens = [ + vision_language_config.image_token_id + ] * vision_language_config.image_feature_size + [0] * ( + seq_len - vision_language_config.image_feature_size) + fake_image_input = MultiModalData( + type=MultiModalData.Type.IMAGE, + data=torch.zeros(vision_language_config.image_input_shape, + dtype=torch.float16)) + else: + prompt_tokens = [0] * seq_len + fake_image_input = None + return SequenceData(prompt_tokens), fake_image_input + + def _is_block_tables_empty(block_tables: Union[None, Dict]): """ Check if block_tables is None or a dictionary with all None values. From 4c5889e9664dac5899217cd1e9abb72be06f422a Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 18 Jun 2024 11:45:06 +0800 Subject: [PATCH 26/71] delete punica --- tests/lora/test_punica.py | 234 ----------------------------- vllm/lora/fully_sharded_layers.py | 1 - vllm/lora/punica.py | 235 +----------------------------- 3 files changed, 6 insertions(+), 464 deletions(-) delete mode 100644 tests/lora/test_punica.py diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py deleted file mode 100644 index f021c003b132..000000000000 --- a/tests/lora/test_punica.py +++ /dev/null @@ -1,234 +0,0 @@ -# Based on code from https://github.com/punica-ai/punica - -import pytest -import torch - -import vllm.lora.punica as punica - - -def assert_close(a, b): - rtol, atol = { - torch.float16: (5e-3, 5e-3), - torch.bfloat16: (3e-2, 2e-2), - torch.float32: (None, None), - }[a.dtype] - torch.testing.assert_close(a, b, rtol=rtol, atol=atol) - - -def _lora_ref_impl( - y_final: torch.Tensor, - x: torch.Tensor, - wa_T_all: torch.Tensor, - wb_T_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, -): - y_stage_1 = torch.empty( - (x.size(0), wa_T_all.size(-2)), - dtype=torch.float32, - device=x.device, - ) - bs = x.shape[0] - s = torch.tensor(scale, dtype=torch.float32, device=x.device) - for i, lora_idx in zip(range(bs), indicies.cpu().tolist()): - xi = x[i].unsqueeze(0).to(torch.float32) - wa = wa_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32) - if wb_T_all is not None: - wb = wb_T_all[lora_idx, layer_idx].transpose(-1, - -2).to(torch.float32) - - tmp = xi @ wa - y_stage_1[i] = tmp.squeeze(0) - y_final[i] += ((tmp @ wb).squeeze(0) * - s if wb_T_all is not None else y_stage_1[i]) - return y_final, y_stage_1 - - -H1 = H2 = [ - 128, - 256, - 512, - 1024, - 1152, - 1280, - 1536, - 2048, - 2304, - 2560, - 2752, - 3072, - 3328, - 3456, - 3584, - 4096, - 4608, - 
5120, - 5504, - 5632, - 6144, - 6400, - 6848, - 6912, - 7168, - 8192, - 9216, - 10240, - 11008, - 13824, - 14336, - 15360, - 22016, - 24576, - 27392, - 27648, - 32000, - 32256, - 32512, - 32768, - 33024, - 36864, - 43264, - 49152, - 64000, - 64256, - 102400, - 102656, - 128000, - 128256, -] -H2 = [64] + H2 -R = [1, 2, 4] -SEED = [0xabcdabcd987] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - - -@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) -@pytest.mark.parametrize("h1", H1) -@pytest.mark.parametrize("r", R) -@pytest.mark.parametrize("seed", SEED) -@torch.inference_mode() -def test_lora_a_extra_shapes(dtype_str, h1, r, seed): - torch.manual_seed(seed) - num_loras = 4 - num_layers = 1 - bs = 32 - dtype = getattr(torch, dtype_str) - device = torch.device("cuda") - - wa_T_all = torch.randn(num_loras, - num_layers, - r, - h1, - dtype=dtype, - device=device) - indices = torch.randint(num_loras, (bs, ), dtype=torch.long, device=device) - - for layer_idx in range(num_layers): - x = torch.randn(bs, h1, dtype=dtype, device=device) - y = torch.randn(bs, r, dtype=dtype, device=device) - - y_ref = y.clone() - _lora_ref_impl( - y_ref, - x, - wa_T_all, - None, - indices, - layer_idx, - 1.0, - ) - - y_our = y.clone() - punica.bgmv(y_our, x, wa_T_all, indices, layer_idx, 1.0) - - assert_close(y_ref, y_our) - - -@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) -@pytest.mark.parametrize("h1", H1) -@pytest.mark.parametrize("h2", H2) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_lora_correctness(dtype_str, h1, h2, seed, device): - torch.manual_seed(seed) - num_loras = 4 - num_layers = 1 - r = 8 - bs = 32 - scale = 0.123 - dtype = getattr(torch, dtype_str) - torch.set_default_device(device) - - wa_T_all = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wb_T_all = torch.randn(num_loras, num_layers, h2, r, dtype=dtype) - indices = torch.randint(num_loras, (bs, ), dtype=torch.long) - - for layer_idx in range(num_layers): - x = torch.randn(bs, h1, dtype=dtype) - y = torch.randn(bs, h2, dtype=dtype) - - y_ref = y.clone() - _lora_ref_impl(y_ref, x, wa_T_all, wb_T_all, indices, layer_idx, scale) - - y_our = y.clone() - punica.add_lora(y_our, x, wa_T_all, wb_T_all, indices, layer_idx, - scale) - - assert_close(y_ref, y_our) - - -@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) -@pytest.mark.parametrize("h1", H1) -@pytest.mark.parametrize("h2", H2) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_lora_correctness_slice(dtype_str, h1, h2, seed, device): - if h2 % 3 != 0 or h2 // 3 not in H1: - pytest.skip("h2 must be divisible by 3 and in supported shapes") - torch.manual_seed(seed) - num_loras = 4 - num_layers = 1 - r = 8 - bs = 32 - scale = 0.123 - dtype = getattr(torch, dtype_str) - torch.set_default_device(device) - - wa_T_all_0 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wa_T_all_1 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wa_T_all_2 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wb_T_all_0 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype) - wb_T_all_1 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype) - wb_T_all_2 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype) - - indices = torch.randint(num_loras, (bs, ), dtype=torch.long) - - for layer_idx in range(num_layers): - x = 
torch.randn(bs, h1, dtype=dtype) - y = torch.randn(bs, h2, dtype=dtype) - s = h2 // 3 - - y_ref = y.clone() - _lora_ref_impl(y_ref[:, :s], x, wa_T_all_0, wb_T_all_0, indices, - layer_idx, scale) - _lora_ref_impl(y_ref[:, s:s * 2], x, wa_T_all_1, wb_T_all_1, indices, - layer_idx, scale) - _lora_ref_impl(y_ref[:, s * 2:], x, wa_T_all_2, wb_T_all_2, indices, - layer_idx, scale) - - y_our = y.clone() - punica.add_lora_slice(y_our, x, wa_T_all_0, wb_T_all_0, indices, - layer_idx, scale, 0, s) - punica.add_lora_slice(y_our, x, wa_T_all_1, wb_T_all_1, indices, - layer_idx, scale, s, s) - punica.add_lora_slice(y_our, x, wa_T_all_2, wb_T_all_2, indices, - layer_idx, scale, s * 2, s) - - assert_close(y_ref[:, :s], y_our[:, :s]) - assert_close(y_ref[:, s:s * 2], y_our[:, s:s * 2]) - assert_close(y_ref[:, s * 2:], y_our[:, s * 2:]) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index fbea667a215e..e405d06ef695 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -17,7 +17,6 @@ MergedQKVParallelLinearWithLora, RowParallelLinearWithLoRA, ) -from vllm.lora.punica import bgmv, dispatch_bgmv_low_level from vllm.lora.punica import ( add_shrink_triton, add_expand_triton, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 7366edf81491..ec4366acf456 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -9,20 +9,6 @@ from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink - -def _raise_import_error(e): - if torch.cuda.get_device_capability() < (8, 0): - raise ImportError( - "punica LoRA kernels require compute capability >= 8.0" - ) from e - else: - raise ImportError( - "punica LoRA kernels could not be imported. If you built vLLM " - "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " - "was set." - ) from e - - _PARAMS_CACHE: Dict[int, Tuple] = {} @@ -30,8 +16,7 @@ def _compute_params(token_lora_tensor: torch.Tensor): pointer = token_lora_tensor.data_ptr() if pointer not in _PARAMS_CACHE: lora_indices_tensor, seq_length_tensor = torch.unique_consecutive( - token_lora_tensor, return_counts=True - ) + token_lora_tensor, return_counts=True) cum_result = torch.cumsum(seq_length_tensor, dim=0) b_seq_start_tensor = torch.zeros_like(seq_length_tensor) b_seq_start_tensor[1:].copy_(cum_result[:-1]) @@ -54,221 +39,13 @@ def reset_params_cache(): _PARAMS_CACHE.clear() -def _get_prefilling_params( - token_lora_tensor: torch.Tensor, cache_clear: bool = False -): +def _get_prefilling_params(token_lora_tensor: torch.Tensor, + cache_clear: bool = False): if cache_clear: reset_params_cache() return _compute_params(token_lora_tensor) -def bgmv( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, -): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight - matrices. - indicies: Shape: `[B]`. Indices of the weight matrices. - layer_idx: Layer index of the weight matrices. - scale: Scaling factor. 
- """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - - punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) - - -def dispatch_bgmv_low_level( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, -): - """ - Same as `bgmv` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. - - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - w_t_all: Shape: `[None, L, y_slice_size, H1]`. Column partition of - all of the transposed LoRA matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - y_offset: Offset to apply to the starting column of y. - y_slice_size: Size of the y column slice. - """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - punica_kernels.dispatch_bgmv_low_level( - y, - x, - w_t_all, - indicies, - layer_idx, - scale, - x.size(1), - y_slice_size, - y_offset, - ) - - -def add_lora( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - *, - buffer: Optional[torch.Tensor] = None, -): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed - LoRA A matrices. - wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed - LoRA B matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - buffer: Optional. Shape: `[B, R]`. Temporary buffer. - """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros( - (x.size(0), r), dtype=torch.float32, device=x.device - ) - punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) - punica_kernels.dispatch_bgmv( - y, buffer, wb_t_all, indicies, layer_idx, scale - ) - - -def add_lora_slice( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, - *, - buffer: Optional[torch.Tensor] = None, -): - """ - Same as `add_lora` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. - - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed - LoRA A matrices. - wb_t_all: Shape: `[None, L, H2, R]`. 
All of the transposed - LoRA B matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - y_offset: Offset to apply to the starting column of y. - y_slice_size: Size of the y column slice. - """ - try: - import vllm._punica_C as punica_kernels - except ImportError as e: - _raise_import_error(e) - - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default ,refer to: - # https://github.com/triton-lang/triton/issues/1387 - buffer = torch.zeros( - (x.size(0), r), dtype=torch.float32, device=x.device - ) - punica_kernels.dispatch_bgmv_low_level( - buffer, - x, - wa_t_all, - indicies, - layer_idx, - 1.0, - x.size(1), - buffer.size(1), - 0, - ) - punica_kernels.dispatch_bgmv_low_level( - y, - buffer, - wb_t_all, - indicies, - layer_idx, - scale, - buffer.size(1), - y_slice_size, - y_offset, - ) - - def add_shrink_triton( y: torch.Tensor, x: torch.Tensor, @@ -403,9 +180,9 @@ def add_lora_triton( if buffer is None: # We set the buffer to be float32 by default ,refer to: # https://github.com/triton-lang/triton/issues/1387 - buffer = torch.zeros( - (x.size(0), r), dtype=torch.float32, device=x.device - ) + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) add_shrink_triton( buffer, From 82560db571fb68e72661229fe37281f5a666aa3e Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 19 Jun 2024 13:38:19 +0800 Subject: [PATCH 27/71] fix bug --- tests/lora/test_triton_punica.py | 4 ++-- vllm/lora/layers.py | 6 +++--- vllm/lora/models.py | 15 +++++++++------ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index a098aba16456..9aa210db7073 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -2,7 +2,6 @@ import torch import vllm._punica_C as punica_kernels -import vllm.lora.punica as punica from vllm.lora.ops.bgmv_expand import bgmv_expand from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink @@ -90,7 +89,8 @@ def assert_close(a, b): @torch.inference_mode() def _punica_bgmv(out_tensor, inputs, lora_weights, indices, scaling): layer_idx = 0 - punica.bgmv(out_tensor, inputs, lora_weights, indices, layer_idx, scaling) + punica_kernels.dispatch_bgmv(out_tensor, inputs, lora_weights, indices, + layer_idx, scaling) return diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 1dd89df3c4f6..abab47f34fdc 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -85,7 +85,7 @@ def _apply_expand_triton( x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) token_num = indices_info[0] - is_prefilling = bool(indices_info[4]) + is_prefilling = bool(indices_info[5]) add_expand_triton( output, x, @@ -131,7 +131,7 @@ def _apply_lora_triton( output = output.view(-1, output.shape[-1]) token_num = indices_info[0] - is_prefilling = bool(indices_info[4]) + is_prefilling = bool(indices_info[5]) add_lora_triton( output, x, @@ -173,7 +173,7 @@ def _apply_lora_triton_nslice( output = output.view(-1, output.shape[-1]) token_num = indices_info[0] - is_prefilling = bool(indices_info[4]) + is_prefilling = bool(indices_info[5]) offset_left = 0 # TODO fuse these kernels for slice_idx in range(len(output_slices)): diff --git a/vllm/lora/models.py b/vllm/lora/models.py index f817bf65ec96..4cb977a25de1 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -87,8 +87,7 @@ def 
convert_mapping( indices_len: List of lengths of the above tensors. Used to index into each tensor. It contains length for (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices). If long_lora doesn't - exist, it only contains first 4 entries. + embeddings_indices, long_lora_indices,prefilling stage flag). """ index_mapping_indices: List[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() @@ -153,6 +152,10 @@ def convert_mapping( ] if long_lora_indices_len is not None: indices_len.append(long_lora_indices_len) + else: + #If long_lora doesn'texist,append None + indices_len.append(None) + indices_len.append(int(mapping.is_prefilling)) return ( base_indices, sampler_indices, @@ -428,10 +431,10 @@ def __init__( # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora. self.scaling_factor_to_offset: Dict[float, int] = {} - # 5 is the number of indicies tensors. + # 6 is the number of indicies tensors. # base_indices, sampler_indices, sampler_indices_padded, - # embeddings_indices,prefilling or decoding - self.indices_len: List[Optional[int]] = [None] * 5 + # embeddings_indices,long_lora_indices,prefilling or decoding + self.indices_len: List[Optional[int]] = [None] * 6 self.model: nn.Module = model if hasattr(self.model, "supported_lora_modules"): @@ -588,7 +591,7 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: else: self.long_lora_indices.zero_() # Maintain the reference - self.indices_len[:] = indices_len + [int(mapping.is_prefilling)] + self.indices_len[:] = indices_len # if mapping.is_prefilling: punica.reset_params_cache() From e3ba5a5ae3cef0106b5420d9b670f0c5b39e06a4 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 20 Jun 2024 13:41:57 +0800 Subject: [PATCH 28/71] fix unit test --- tests/lora/test_lora.py | 93 ++++++++++++----- tests/lora/test_triton_punica.py | 149 +++++++++++++-------------- vllm/lora/fully_sharded_layers.py | 82 +++------------ vllm/lora/layers.py | 122 ++++++++++------------ vllm/lora/models.py | 10 +- vllm/lora/ops/bgmv_expand.py | 6 +- vllm/lora/ops/bgmv_expand_slice.py | 4 +- vllm/lora/ops/bgmv_shrink.py | 2 + vllm/lora/ops/sgmv_expand.py | 4 +- vllm/lora/ops/sgmv_expand_slice.py | 3 +- vllm/lora/ops/utils.py | 4 +- vllm/lora/punica.py | 57 ++++++++-- vllm/model_executor/layers/linear.py | 1 + 13 files changed, 279 insertions(+), 258 deletions(-) diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py index 3415d36b7e34..4bc959b826bb 100644 --- a/tests/lora/test_lora.py +++ b/tests/lora/test_lora.py @@ -20,13 +20,16 @@ torch.bfloat16: (3e-2, 2e-2), } +STAGES = [0, 1] #prefilling(1) or decoding(0) + @pytest.mark.parametrize("m", TENSOR_SIZES) @pytest.mark.parametrize("n", TENSOR_SIZES) @pytest.mark.parametrize("k", BATCH_SIZES) @pytest.mark.parametrize("rank", RANKS) @pytest.mark.parametrize("dtype", DTYPES) -def test_apply_lora(m, n, k, rank, dtype) -> None: +@pytest.mark.parametrize("stage", STAGES) +def test_apply_lora(m, n, k, rank, dtype, stage) -> None: manager = DummyLoRAManager() module_name = "module" @@ -53,19 +56,31 @@ def test_apply_lora(m, n, k, rank, dtype) -> None: for i in range(lora_a_stack.shape[0]): lora_a_stack[i][0] = lora.lora_a.T lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T - + indices_info = [None] * 6 + indices_info[0] = k + indices_info[5] = stage output = torch.zeros(k, m, device="cuda", dtype=dtype) - _apply_lora( - input, lora_a_stack, lora_b_stack, - torch.randint(0, lora_a_stack.shape[0], (len(input), 
), device="cuda"), - output) + _apply_lora(input, + lora_a_stack, + lora_b_stack, + torch.randint(0, + lora_a_stack.shape[0], (len(input), ), + device="cuda"), + indices_info, + output, + cache_clear=True) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora(input, lora_a_stack, lora_b_stack, - torch.full((len(input), ), -1, device="cuda"), output) + _apply_lora(input, + lora_a_stack, + lora_b_stack, + torch.full((len(input), ), -1, device="cuda"), + indices_info, + output, + cache_clear=True) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() @@ -76,7 +91,8 @@ def test_apply_lora(m, n, k, rank, dtype) -> None: @pytest.mark.parametrize("k", BATCH_SIZES) @pytest.mark.parametrize("rank", RANKS) @pytest.mark.parametrize("dtype", DTYPES) -def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: +@pytest.mark.parametrize("stage", STAGES) +def test_apply_lora_packed_2slice(m, n, k, rank, dtype, stage) -> None: if m % 2 != 0: pytest.skip("m must be divisible by 2") if m // 2 not in TENSOR_SIZES: @@ -91,7 +107,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: lora_1 = manager.get_module_lora(module_name + "1") manager.init_random_lora(module_name + "2", weight, rank=rank) lora_2 = manager.get_module_lora(module_name + "2") - + input = torch.rand(k, n, device="cuda", dtype=dtype) expected = torch.cat([ input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling, @@ -120,21 +136,32 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T lora_a_stacks[1][i][0] = lora_2.lora_a.T lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T - + indices_info = [None] * 6 + indices_info[0] = k + indices_info[5] = stage output = torch.zeros(k, m, device="cuda", dtype=dtype) - _apply_lora_packed_nslice( - input, lora_a_stacks, lora_b_stacks, - torch.randint(0, - lora_a_stacks[0].shape[0], (len(input), ), - device="cuda"), output, (m // 2, m // 2)) + _apply_lora_packed_nslice(input, + lora_a_stacks, + lora_b_stacks, + torch.randint(0, + lora_a_stacks[0].shape[0], + (len(input), ), + device="cuda"), + indices_info, + output, (m // 2, m // 2), + cache_clear=True) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, + _apply_lora_packed_nslice(input, + lora_a_stacks, + lora_b_stacks, torch.full((len(input), ), -1, device="cuda"), - output, (m // 2, m // 2)) + indices_info, + output, (m // 2, m // 2), + cache_clear=True) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() @@ -145,7 +172,8 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: @pytest.mark.parametrize("k", BATCH_SIZES) @pytest.mark.parametrize("rank", RANKS) @pytest.mark.parametrize("dtype", DTYPES) -def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: +@pytest.mark.parametrize("stage", STAGES) +def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype, stage) -> None: manager = DummyLoRAManager() module_name = "module" @@ -204,21 +232,32 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T lora_a_stacks[2][i][0] = lora_v.lora_a.T lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T - + indices_info = [None] * 6 + indices_info[0] = k + indices_info[5] = stage #decoding stage output = torch.zeros(k, sum(qkv), 
device="cuda", dtype=dtype) - _apply_lora_packed_nslice( - input, lora_a_stacks, lora_b_stacks, - torch.randint(0, - lora_a_stacks[0].shape[0], (len(input), ), - device="cuda"), output, (qkv[0], qkv[1], qkv[2])) + _apply_lora_packed_nslice(input, + lora_a_stacks, + lora_b_stacks, + torch.randint(0, + lora_a_stacks[0].shape[0], + (len(input), ), + device="cuda"), + indices_info, + output, (qkv[0], qkv[1], qkv[2]), + cache_clear=True) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, + _apply_lora_packed_nslice(input, + lora_a_stacks, + lora_b_stacks, torch.full((len(input), ), -1, device="cuda"), - output, (qkv[0], qkv[1], qkv[2])) + indices_info, + output, (qkv[0], qkv[1], qkv[2]), + cache_clear=True) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 9aa210db7073..d4281004a7a2 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -324,82 +324,81 @@ def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, # ref_out_tensor = ref_out_tensor.to(torch.float32) # assert_close(our_out_tensor, ref_out_tensor) +# @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +# @pytest.mark.parametrize("scaling", SCALES) +# @pytest.mark.parametrize("dtype", DTYPES) +# @pytest.mark.parametrize("op_type", OP_TYPES) +# @pytest.mark.parametrize("seed", SEED) +# @pytest.mark.parametrize("device", CUDA_DEVICES) +# def test_triton_sgmv_punica_bgmv( +# hidden_size, +# scaling: float, +# dtype: torch.dtype, +# op_type: str, +# seed: int, +# device: str, +# ): +# # avoid `No suitable kernel. h_in=xx h_out=xxxx ` error +# if dtype == torch.float32 or hidden_size == 3424: +# return +# torch.manual_seed(seed) +# torch.set_default_device(device) +# batchs = 4 # Arbitrary values for testing +# rank = 16 # Arbitrary values for testing +# seq_len = 128 # Arbitrary values for testing +# num_loras = 8 # Arbitrary values for testing +# ( +# inputs_tensor, +# lora_weights, +# our_out_tensor, +# ref_out_tensor, +# b_seq_start_loc, +# lora_indices_tensor, +# seq_len_tensor, +# indices, +# ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, +# op_type, device) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", OP_TYPES) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_triton_sgmv_punica_bgmv( - hidden_size, - scaling: float, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error - if dtype == torch.float32 or hidden_size == 3424: - return - torch.manual_seed(seed) - torch.set_default_device(device) - batchs = 4 # Arbitrary values for testing - rank = 16 # Arbitrary values for testing - seq_len = 128 # Arbitrary values for testing - num_loras = 8 # Arbitrary values for testing - ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, - op_type, device) - - max_seq_length = seq_len_tensor.max() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - if op_type == "shrink": - sgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - scaling, - ) - else: - sgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batchs, - max_seq_length, - add_inputs=True, - ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - _punica_bgmv( - ref_out_tensor, - inputs_tensor, - lora_weights_4d, - indices, - scaling if op_type == "shrink" else 1.0, - ) - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) - assert_close(our_out_tensor, ref_out_tensor) +# max_seq_length = seq_len_tensor.max() +# if isinstance(max_seq_length, tuple): +# max_seq_length = max_seq_length[0].item() +# else: +# max_seq_length = max_seq_length.item() +# if op_type == "shrink": +# sgmv_shrink( +# inputs_tensor, +# lora_weights, +# our_out_tensor, +# b_seq_start_loc, +# seq_len_tensor, +# lora_indices_tensor, +# batchs, +# max_seq_length, +# scaling, +# ) +# else: +# sgmv_expand( +# inputs_tensor, +# lora_weights, +# our_out_tensor, +# b_seq_start_loc, +# seq_len_tensor, +# lora_indices_tensor, +# batchs, +# max_seq_length, +# add_inputs=True, +# ) +# lora_weights_4d = lora_weights.unsqueeze(dim=1) +# _punica_bgmv( +# ref_out_tensor, +# inputs_tensor, +# lora_weights_4d, +# indices, +# scaling if op_type == "shrink" else 1.0, +# ) +# if op_type == "shrink": +# ref_out_tensor = ref_out_tensor.to(torch.float32) +# assert_close(our_out_tensor, ref_out_tensor) @pytest.mark.parametrize("batchs", BATCHS) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index e405d06ef695..76544e1d51ca 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -18,9 +18,9 @@ RowParallelLinearWithLoRA, ) from vllm.lora.punica import ( - add_shrink_triton, - add_expand_triton, - add_expand_slice_triton, + add_shrink, + add_expand, + add_expand_slice, ) if TYPE_CHECKING: @@ -71,18 +71,9 @@ def apply(self, x: torch.Tensor, dtype=torch.float32, device=x.device, ) - - # bgmv( - # buffer, - # x, - # self.lora_a_stacked, - # self.indices[: self.indices_len[0]], - # 0, - # 1.0, - # ) token_num = self.indices_len[0] - is_prefilling = bool(self.indices_len[4]) - add_shrink_triton( + is_prefilling = bool(self.indices_len[5]) + add_shrink( buffer, x, self.lora_a_stacked, @@ -92,15 +83,7 @@ def apply(self, x: torch.Tensor, is_prefilling, ) buffer = tensor_model_parallel_all_gather(buffer) - # bgmv( - # output, - # buffer, - # self.lora_b_stacked, - # self.indices[: self.indices_len[0]], - # 0, - # 1.0, - # ) - add_expand_triton( + add_expand( output, buffer, self.lora_b_stacked, @@ -110,7 +93,6 @@ def apply(self, x: torch.Tensor, 
add_input=True, ) # now have column partitioned output - output = output.view(*out_orig_shape) return output @@ -138,7 +120,7 @@ def _mcp_apply(x, bias, layer): MergedColumnParallelLinearWithShardedLoRA and QKVParallelLinearWithShardedLora share the same LoRa weight application method. - + The main difference is the step by shard_size for lora_b which can vary for QKVParallelLinearWithShardedLora but is constant for MergedColumnParallelLinearWithShardedLoRA. @@ -155,18 +137,10 @@ def _mcp_apply(x, bias, layer): device=x.device, ) token_num = layer.indices_len[0] - is_prefilling = bool(layer.indices_len[4]) + is_prefilling = bool(layer.indices_len[5]) for idx in range(n): - # bgmv( - # buffers[idx], - # x, - # layer.lora_a_stacked[idx], - # layer.indices[: layer.indices_len[0]], - # 0, - # 1.0, - # ) - - add_shrink_triton( + + add_shrink( buffers[idx], x, layer.lora_a_stacked[idx], @@ -180,17 +154,7 @@ def _mcp_apply(x, bias, layer): left_offset = 0 for idx in range(n): shard_size = layer.lora_b_stacked[idx].shape[2] - # dispatch_bgmv_low_level( - # output, - # buffers[idx], - # layer.lora_b_stacked[idx], - # layer.indices[: layer.indices_len[0]], - # 0, - # 1.0, - # left_offset, - # shard_size, - # ) - add_expand_slice_triton( + add_expand_slice( output, buffers[idx], layer.lora_b_stacked[idx], @@ -328,17 +292,9 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: dtype=torch.float32, device=x.device, ) - # bgmv( - # buffer, - # x, - # self.lora_a_stacked, - # self.indices[: self.indices_len[0]], - # 0, - # 1.0, - # ) token_num = self.indices_len[0] - is_prefilling = bool(self.indices_len[4]) - add_shrink_triton( + is_prefilling = bool(self.indices_len[5]) + add_shrink( buffer, x, self.lora_a_stacked, @@ -357,17 +313,7 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # reduced before being used shard_size = self.lora_b_stacked.shape[2] start_idx = self.tp_rank * shard_size - # dispatch_bgmv_low_level( - # output, - # buffer, - # self.lora_b_stacked, - # self.indices[: self.indices_len[0]], - # 0, - # 1.0, - # start_idx, - # shard_size, - # ) - add_expand_slice_triton( + add_expand_slice( output, buffer, self.lora_b_stacked, diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index abab47f34fdc..35c974fb6d5f 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -21,8 +21,8 @@ # from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.punica import ( - add_lora_triton, - add_expand_triton, + add_lora, + add_expand, ) from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -73,7 +73,7 @@ def dec(*args, **kwargs): return dec -def _apply_expand_triton( +def _apply_expand( x: torch.Tensor, lora_b_stacked: torch.Tensor, lora_index_tensor: torch.Tensor, @@ -86,7 +86,7 @@ def _apply_expand_triton( output = output.view(-1, output.shape[-1]) token_num = indices_info[0] is_prefilling = bool(indices_info[5]) - add_expand_triton( + add_expand( output, x, lora_b_stacked, @@ -98,15 +98,14 @@ def _apply_expand_triton( return output.view_as(org_output) -def _apply_lora_triton( - x: torch.Tensor, - lora_a_stacked: torch.Tensor, - lora_b_stacked: torch.Tensor, - lora_index_tensor: torch.Tensor, - indices_info: List[int], - output: torch.Tensor, -) -> torch.Tensor: - """Applies lora to each input. 
This method applies all loras to each +def _apply_lora(x: torch.Tensor, + lora_a_stacked: torch.Tensor, + lora_b_stacked: torch.Tensor, + lora_index_tensor: torch.Tensor, + indices_info: List[int], + output: torch.Tensor, + cache_clear: bool = False) -> torch.Tensor: + """Applies lora to each input. This method applies all loras to each input. It uses the `lora_index_tensor` vector to determine which lora yields the correct output. An index of -1 means no lora should be applied. This method adds the final lora results to the output. @@ -117,9 +116,9 @@ def _apply_lora_triton( lora_b_stacked (torch.Tensor): (num_loras, output_dim, lora_rank) lora_index_tensor (torch.Tensor): (batch_size*seq_number,). The LoRA index corresponding to each token - indices_info: List[int]: 5 is the number of indicies tensors. - # base_indices, sampler_indices, sampler_indices_padded, - # embeddings_indices,prefilling or decoding + indices_len(List):(6,), It contains (base_indices, sampler_indices, + sampler_indices_padded,embeddings_indices, long_lora_indices, + prefilling flag). output (torch.Tensor): (batch_size, output_dim) Returns: @@ -131,42 +130,34 @@ def _apply_lora_triton( output = output.view(-1, output.shape[-1]) token_num = indices_info[0] + is_prefilling = bool(indices_info[5]) - add_lora_triton( - output, - x, - lora_a_stacked, - lora_b_stacked, - lora_index_tensor[:token_num], - 0, - 1.0, - is_prefilling, - ) + add_lora(output, + x, + lora_a_stacked, + lora_b_stacked, + lora_index_tensor[:token_num], + 0, + 1.0, + is_prefilling, + cache_clear=cache_clear) return output.view_as(org_output) -def _apply_lora_triton_nslice( - x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - lora_index_tensor: torch.Tensor, - indices_info: List[int], - output: torch.Tensor, - output_slices: Tuple[int, ...], -) -> torch.Tensor: - """_summary_ - - Args: - x (torch.Tensor): _description_ - lora_a_stacked (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): _description_ - lora_b_stacked (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): _description_ - lora_index_tensor (torch.Tensor): _description_ - indices_info (List[int]): _description_ - output (torch.Tensor): _description_ - output_slices (Tuple[int, ...]): _description_ - - Returns: - torch.Tensor: _description_ +def _apply_lora_packed_nslice(x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, + torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, + torch.Tensor], + lora_index_tensor: torch.Tensor, + indices_info: List[int], + output: torch.Tensor, + output_slices: Tuple[int, ...], + cache_clear: bool = False) -> torch.Tensor: + """ + Applies lora to each input. Similar to _apply_lora, This method is + used for layers that are composed of multiple sublayers + (slices) packed together. 
""" org_output = output x = x.view(-1, x.shape[-1]) @@ -177,18 +168,17 @@ def _apply_lora_triton_nslice( offset_left = 0 # TODO fuse these kernels for slice_idx in range(len(output_slices)): - add_lora_triton( - output, - x, - lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], - lora_index_tensor[:token_num], - 0, - 1.0, - is_prefilling, - offset_left, - output_slices[slice_idx], - ) + add_lora(output, + x, + lora_a_stacked[slice_idx], + lora_b_stacked[slice_idx], + lora_index_tensor[:token_num], + 0, + 1.0, + is_prefilling, + offset_left, + output_slices[slice_idx], + cache_clear=cache_clear) offset_left += output_slices[slice_idx] return output.view_as(org_output) @@ -407,7 +397,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[1], -1, ) - _apply_expand_triton( + _apply_expand( full_lora_a_embeddings, self.lora_b_stacked, self.indices, @@ -526,7 +516,7 @@ def set_mapping( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_triton( + _apply_lora( x, self.lora_a_stacked, self.lora_b_stacked, @@ -687,7 +677,7 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_triton_nslice( + _apply_lora_packed_nslice( x, self.lora_a_stacked, self.lora_b_stacked, @@ -957,7 +947,7 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_triton_nslice( + _apply_lora_packed_nslice( x, self.lora_a_stacked, self.lora_b_stacked, @@ -1078,7 +1068,7 @@ def set_mapping( def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) # maybe we need not restrict range to [:batch_size] - _apply_lora_triton( + _apply_lora( x, self.lora_a_stacked, self.lora_b_stacked, @@ -1301,7 +1291,7 @@ def _get_logits( # sampler_indices sampler_indices = self.indices_len[1] is_prefilling = False - add_lora_triton( + add_lora( logits, hidden_states, self.lora_a_stacked, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 4cb977a25de1..b7923ce4de8e 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -84,10 +84,10 @@ def convert_mapping( long_lora_indices: Tensor of shape [batch_size] mapping requests to RoPE offsets and rot dims for long LoRAs. None if long context lora doesn't exist. - indices_len: List of lengths of the above tensors. - Used to index into each tensor. It contains length for + indices_len: List of lengths of the above tensors and prefilling + flag.Used to index into each tensor. It contains (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices,prefilling stage flag). + embeddings_indices, long_lora_indices,prefilling flag). 
""" index_mapping_indices: List[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() @@ -153,8 +153,10 @@ def convert_mapping( if long_lora_indices_len is not None: indices_len.append(long_lora_indices_len) else: - #If long_lora doesn'texist,append None + #If long_lora doesn't exist,append None indices_len.append(None) + # Append a prefilling flag to help selecting the appropriate lora + # ops (sgmv or bgmv) indices_len.append(int(mapping.is_prefilling)) return ( base_indices, diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index b977540cbfb4..ec68c6d20f98 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -103,8 +103,10 @@ def bgmv_expand( lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch batchs (int): batch size - add_inputs (bool, optional): _description_. Defaults to False. - cast_type (bool, optional): _description_. Defaults to False. + add_inputs (bool, optional): Defaults to False. adds the final lora + results to the output. + override_config (Optional[Dict[str, int]], optional): Defaults to None. + Triton grid config """ assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index c741d10e9c9d..af343d6eae1c 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -109,7 +109,9 @@ def bgmv_expand_slice( slice_offst (int): output_tensor's offst slice_size (int): current output_tensor's size batchs (int): batch size - add_inputs (bool, optional): _description_. Defaults to False. + add_inputs (bool, optional): Defaults to False. + override_config (Optional[Dict[str, int]], optional): Defaults to None. + Triton grid config """ assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index a7087a96488f..6b92ed72c4c2 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -97,6 +97,8 @@ def bgmv_shrink( corresponding to each batch batchs (int): batch size scaling (float): Scaling factor. + override_config (Optional[Dict[str, int]], optional): Defaults to None. + Triton grid config """ assert inputs.dtype == lora_a_weights.dtype assert inputs.dtype in [torch.float16, torch.bfloat16] diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index f34eec0357bd..879184db0b8b 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -122,8 +122,8 @@ def sgmv_expand( batchs (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch - add_inputs (bool, optional): _description_. Defaults to False. - cast_type (bool, optional): _description_. Defaults to False. + add_inputs (bool, optional): Defaults to False. adds the final lora + results to the output. """ assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 2fdedd591032..000fef304823 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -133,7 +133,8 @@ def sgmv_expand_slice( in the batch slice_offst (int): output_tensor's offst slice_size (int): current output_tensor's size - add_inputs (bool, optional): _description_. Defaults to False. + add_inputs (bool, optional): Defaults to False. adds the final lora + results to the output.. 
""" assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py index 6124916cfd9d..e08b4409af75 100644 --- a/vllm/lora/ops/utils.py +++ b/vllm/lora/ops/utils.py @@ -1,6 +1,7 @@ import functools import json import os +import torch from typing import Dict @@ -9,8 +10,7 @@ def _get_config_file_name( batchs: int, hidden_size: int, ) -> str: - # device_name = torch.cuda.get_device_name().replace(" ", "_") - device_name = "NVIDIA_GeForce_RTX_3090" + device_name = torch.cuda.get_device_name().replace(" ", "_") return (f"op_type={op_type},batchs={batchs},hidden_size={hidden_size} " + f"device_name={device_name}.json") diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index ec4366acf456..695fd7446945 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -12,7 +12,12 @@ _PARAMS_CACHE: Dict[int, Tuple] = {} -def _compute_params(token_lora_tensor: torch.Tensor): +def _compute_params( + token_lora_tensor: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: + """ + Get the information required for the sgmv kernel. + """ pointer = token_lora_tensor.data_ptr() if pointer not in _PARAMS_CACHE: lora_indices_tensor, seq_length_tensor = torch.unique_consecutive( @@ -36,6 +41,7 @@ def reset_params_cache(): """At the beginning of the prefilling stage, we need clear the cache explicitly """ + #TODO release gpu memory _PARAMS_CACHE.clear() @@ -46,7 +52,7 @@ def _get_prefilling_params(token_lora_tensor: torch.Tensor, return _compute_params(token_lora_tensor) -def add_shrink_triton( +def add_shrink( y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, @@ -56,6 +62,10 @@ def add_shrink_triton( is_prefilling: bool, cache_clear: bool = False, ): + """ + y=x@w_t_all + When `is_prefilling` is True, will lanuch `sgmv_shrink` + """ if is_prefilling: ( b_seq_start_tensor, @@ -79,7 +89,7 @@ def add_shrink_triton( bgmv_shrink(x, w_t_all, y, lora_indices_tensor, scale) -def add_expand_triton( +def add_expand( y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, @@ -89,6 +99,10 @@ def add_expand_triton( add_input: bool = True, cache_clear: bool = False, ): + """ + y+=x@w_t_all + When `is_prefilling` is True, will lanuch `sgmv_expand`, + """ if is_prefilling: ( b_seq_start_tensor, @@ -112,7 +126,7 @@ def add_expand_triton( bgmv_expand(x, w_t_all, y, lora_indices_tensor, add_inputs=add_input) -def add_expand_slice_triton( +def add_expand_slice( y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, @@ -124,6 +138,9 @@ def add_expand_slice_triton( add_input: bool = True, cache_clear: bool = False, ): + """ + y+=x@w_t_all + """ if is_prefilling: ( b_seq_start_tensor, @@ -157,7 +174,7 @@ def add_expand_slice_triton( ) -def add_lora_triton( +def add_lora( y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, @@ -173,9 +190,29 @@ def add_lora_triton( cache_clear: bool = False, ): """ - Same as `add_lora_triton` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + Args: + y (torch.Tensor): Output tensor. Will be changed in-place. + x (torch.Tensor): Input tensor + wa_t_all (torch.Tensor): lora_a's weight + wb_t_all (torch.Tensor): lora_b's weight + lora_indices_tensor (torch.Tensor): _description_ + layer_idx (int): Layer index of LoRA weights. 
+ scale (float): Scaling factor. + is_prefilling (bool): prefiling stage + y_offset (Optional[int], optional): Offset to apply to the starting + column of y. + y_slice_size (Optional[int], optional): Size of the y column slice.. + buffer (Optional[torch.Tensor], optional): Defaults to None. + cache_clear (bool, optional): Defaults to False. """ + r = wb_t_all.size(-1) if buffer is None: # We set the buffer to be float32 by default ,refer to: @@ -184,7 +221,7 @@ def add_lora_triton( dtype=torch.float32, device=x.device) - add_shrink_triton( + add_shrink( buffer, x, wa_t_all, @@ -195,7 +232,7 @@ def add_lora_triton( cache_clear=cache_clear, ) if y_offset is None and y_slice_size is None: - add_expand_triton( + add_expand( y, buffer, wb_t_all, @@ -206,7 +243,7 @@ def add_lora_triton( cache_clear=cache_clear, ) else: - add_expand_slice_triton( + add_expand_slice( y, buffer, wb_t_all, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 34fbfa8e33ef..3b1a846f0d1b 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -93,6 +93,7 @@ def apply(self, if bias is not None: return F.linear(x, weight) + bias return F.linear(x, weight) + return F.linear(x, weight, bias) From 348c4a4ea1c32efebd0f915f7cdb4abb41293d93 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 20 Jun 2024 13:48:10 +0800 Subject: [PATCH 29/71] reformat --- tests/lora/test_lora.py | 2 +- tests/lora/test_triton_punica.py | 149 ++++++++++++++------------- vllm/lora/fully_sharded_layers.py | 20 ++-- vllm/lora/layers.py | 38 +++---- vllm/lora/models.py | 19 ++-- vllm/lora/ops/utils.py | 3 +- vllm/lora/punica.py | 12 ++- vllm/model_executor/layers/linear.py | 1 - 8 files changed, 111 insertions(+), 133 deletions(-) diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py index 4bc959b826bb..51708c8fa6e5 100644 --- a/tests/lora/test_lora.py +++ b/tests/lora/test_lora.py @@ -107,7 +107,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype, stage) -> None: lora_1 = manager.get_module_lora(module_name + "1") manager.init_random_lora(module_name + "2", weight, rank=rank) lora_2 = manager.get_module_lora(module_name + "2") - + input = torch.rand(k, n, device="cuda", dtype=dtype) expected = torch.cat([ input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling, diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index d4281004a7a2..9aa210db7073 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -324,81 +324,82 @@ def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, # ref_out_tensor = ref_out_tensor.to(torch.float32) # assert_close(our_out_tensor, ref_out_tensor) -# @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -# @pytest.mark.parametrize("scaling", SCALES) -# @pytest.mark.parametrize("dtype", DTYPES) -# @pytest.mark.parametrize("op_type", OP_TYPES) -# @pytest.mark.parametrize("seed", SEED) -# @pytest.mark.parametrize("device", CUDA_DEVICES) -# def test_triton_sgmv_punica_bgmv( -# hidden_size, -# scaling: float, -# dtype: torch.dtype, -# op_type: str, -# seed: int, -# device: str, -# ): -# # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error -# if dtype == torch.float32 or hidden_size == 3424: -# return -# torch.manual_seed(seed) -# torch.set_default_device(device) -# batchs = 4 # Arbitrary values for testing -# rank = 16 # Arbitrary values for testing -# seq_len = 128 # Arbitrary values for testing -# num_loras = 8 # Arbitrary values for testing -# ( -# inputs_tensor, -# lora_weights, -# our_out_tensor, -# ref_out_tensor, -# b_seq_start_loc, -# lora_indices_tensor, -# seq_len_tensor, -# indices, -# ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, -# op_type, device) -# max_seq_length = seq_len_tensor.max() -# if isinstance(max_seq_length, tuple): -# max_seq_length = max_seq_length[0].item() -# else: -# max_seq_length = max_seq_length.item() -# if op_type == "shrink": -# sgmv_shrink( -# inputs_tensor, -# lora_weights, -# our_out_tensor, -# b_seq_start_loc, -# seq_len_tensor, -# lora_indices_tensor, -# batchs, -# max_seq_length, -# scaling, -# ) -# else: -# sgmv_expand( -# inputs_tensor, -# lora_weights, -# our_out_tensor, -# b_seq_start_loc, -# seq_len_tensor, -# lora_indices_tensor, -# batchs, -# max_seq_length, -# add_inputs=True, -# ) -# lora_weights_4d = lora_weights.unsqueeze(dim=1) -# _punica_bgmv( -# ref_out_tensor, -# inputs_tensor, -# lora_weights_4d, -# indices, -# scaling if op_type == "shrink" else 1.0, -# ) -# if op_type == "shrink": -# ref_out_tensor = ref_out_tensor.to(torch.float32) -# assert_close(our_out_tensor, ref_out_tensor) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_triton_sgmv_punica_bgmv( + hidden_size, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error + if dtype == torch.float32 or hidden_size == 3424: + return + torch.manual_seed(seed) + torch.set_default_device(device) + batchs = 4 # Arbitrary values for testing + rank = 16 # Arbitrary values for testing + seq_len = 128 # Arbitrary values for testing + num_loras = 8 # Arbitrary values for testing + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + op_type, device) + + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + if op_type == "shrink": + sgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + scaling, + ) + else: + sgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + add_inputs=True, + ) + lora_weights_4d = lora_weights.unsqueeze(dim=1) + _punica_bgmv( + ref_out_tensor, + inputs_tensor, + lora_weights_4d, + indices, + scaling if op_type == "shrink" else 1.0, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) @pytest.mark.parametrize("batchs", BATCHS) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 76544e1d51ca..a2cb031c8ca9 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -7,21 +7,13 @@ from vllm.config import LoRAConfig from vllm.distributed.communication_op import ( - tensor_model_parallel_all_gather, - tensor_model_parallel_all_reduce, -) + tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) from vllm.distributed.parallel_state import get_tensor_model_parallel_rank -from vllm.lora.layers import ( - ColumnParallelLinearWithLoRA, - MergedColumnParallelLinearWithLoRA, - MergedQKVParallelLinearWithLora, - RowParallelLinearWithLoRA, -) -from vllm.lora.punica import ( - add_shrink, - add_expand, - add_expand_slice, -) +from vllm.lora.layers import (ColumnParallelLinearWithLoRA, + MergedColumnParallelLinearWithLoRA, + MergedQKVParallelLinearWithLora, + RowParallelLinearWithLoRA) +from vllm.lora.punica import add_expand, add_expand_slice, add_shrink if TYPE_CHECKING: pass diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 35c974fb6d5f..80bd2dd765ea 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1,6 +1,6 @@ # pylint: disable=unused-argument import math -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import torch @@ -9,34 +9,24 @@ from transformers import PretrainedConfig from vllm.config import LoRAConfig -from vllm.distributed import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - split_tensor_along_last_dim, - tensor_model_parallel_all_gather, - tensor_model_parallel_all_reduce, - tensor_model_parallel_gather, -) +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + split_tensor_along_last_dim, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, + tensor_model_parallel_gather) from vllm.distributed.utils import divide - # from vllm.lora.ops.sgmv_expand import sgmv_expand -from 
vllm.lora.punica import ( - add_lora, - add_expand, -) -from vllm.model_executor.layers.linear import ( - ColumnParallelLinear, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear, -) +from vllm.lora.punica import add_expand, add_lora +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import ( - LinearScalingRotaryEmbedding, - RotaryEmbedding, -) + LinearScalingRotaryEmbedding, RotaryEmbedding) from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ) + VocabParallelEmbedding) if TYPE_CHECKING: pass diff --git a/vllm/lora/models.py b/vllm/lora/models.py index b7923ce4de8e..d34725523c9c 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -12,22 +12,15 @@ from vllm.config import LoRAConfig from vllm.logger import init_logger -from vllm.lora.layers import ( - BaseLayerWithLoRA, - LinearScalingRotaryEmbeddingWithLora, - LoRAMapping, -) +from vllm.lora import punica +from vllm.lora.layers import (BaseLayerWithLoRA, + LinearScalingRotaryEmbeddingWithLora, + LoRAMapping) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights -from vllm.lora.utils import ( - from_layer, - from_layer_logits_processor, - parse_fine_tuned_lora_name, - replace_submodule, -) +from vllm.lora.utils import (from_layer, from_layer_logits_processor, + parse_fine_tuned_lora_name, replace_submodule) from vllm.utils import LRUCache, is_pin_memory_available -from vllm.lora import punica - logger = init_logger(__name__) _GLOBAL_LORA_ID = 0 diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py index e08b4409af75..980dc8c6693f 100644 --- a/vllm/lora/ops/utils.py +++ b/vllm/lora/ops/utils.py @@ -1,9 +1,10 @@ import functools import json import os -import torch from typing import Dict +import torch + def _get_config_file_name( op_type: str, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 695fd7446945..321fccc9df93 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -1,7 +1,9 @@ # Based on code from https://github.com/punica-ai/punica -from typing import Optional, Dict, Tuple +from typing import Dict, Optional, Tuple + import torch + from vllm.lora.ops.bgmv_expand import bgmv_expand from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink @@ -64,7 +66,7 @@ def add_shrink( ): """ y=x@w_t_all - When `is_prefilling` is True, will lanuch `sgmv_shrink` + When `is_prefilling` is True, will launch `sgmv_shrink` """ if is_prefilling: ( @@ -101,7 +103,7 @@ def add_expand( ): """ y+=x@w_t_all - When `is_prefilling` is True, will lanuch `sgmv_expand`, + When `is_prefilling` is True, will launch `sgmv_expand`, """ if is_prefilling: ( @@ -133,8 +135,8 @@ def add_expand_slice( lora_indices_tensor: torch.Tensor, layer_idx: int, is_prefilling: bool, - y_offset: int, - y_slice_size: int, + y_offset: Optional[int], + y_slice_size: Optional[int], add_input: bool = True, cache_clear: bool = False, ): diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 3b1a846f0d1b..34fbfa8e33ef 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -93,7 +93,6 @@ def apply(self, if bias is not None: return F.linear(x, weight) + bias return F.linear(x, weight) - return F.linear(x, weight, bias) From 
fa27688239e2160cdd8cfffbc7eb793fe26a906a Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 21 Jun 2024 01:22:46 +0800 Subject: [PATCH 30/71] update --- vllm/lora/layers.py | 22 ----------- vllm/lora/punica.py | 89 --------------------------------------------- 2 files changed, 111 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 00bd6278bb0e..a4deff6c221f 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -258,27 +258,6 @@ def __init__(self, base_layer: VocabParallelEmbedding) -> None: self.embeddings_weights: Optional[torch.Tensor] def create_lora_weights( -<<<<<<< HEAD - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - lora_vocab_start_idx = self.base_layer.org_vocab_size - weights_idx = None - if self.base_layer.vocab_end_index > lora_vocab_start_idx: - # We can start adding lora weights - weights_idx = max( - lora_vocab_start_idx - self.base_layer.vocab_start_index, 0) - self.embeddings_slice = ( - self.base_layer.vocab_start_index - - self.base_layer.org_vocab_size + weights_idx, - self.base_layer.vocab_end_index - - self.base_layer.org_vocab_size, - ) - self.embeddings_weights = self.base_layer.weight.data[weights_idx:] - self.embeddings_weights.fill_(0) -======= self, max_loras: int, lora_config: LoRAConfig, @@ -297,7 +276,6 @@ def create_lora_weights( self.base_layer.org_vocab_size) self.base_layer.weight.data[ self.base_layer.num_org_embeddings_per_partition:].fill_(0) ->>>>>>> main else: self.embeddings_slice = None self.embeddings_weights = None diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index a1418d23c5e3..321fccc9df93 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -4,7 +4,6 @@ import torch -<<<<<<< HEAD from vllm.lora.ops.bgmv_expand import bgmv_expand from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink @@ -47,23 +46,6 @@ def reset_params_cache(): #TODO release gpu memory _PARAMS_CACHE.clear() -======= -from vllm import _custom_ops as ops - - -def _check_punica_support(): - if ops.is_custom_op_supported("_punica_C::dispatch_bgmv"): - return - - if torch.cuda.get_device_capability() < (8, 0): - raise ImportError( - "punica LoRA kernels require compute capability >= 8.0") - else: - raise ImportError( - "punica LoRA kernels could not be imported. 
If you built vLLM " - "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " - "was set.") ->>>>>>> main def _get_prefilling_params(token_lora_tensor: torch.Tensor, cache_clear: bool = False): @@ -86,7 +68,6 @@ def add_shrink( y=x@w_t_all When `is_prefilling` is True, will launch `sgmv_shrink` """ -<<<<<<< HEAD if is_prefilling: ( b_seq_start_tensor, @@ -108,11 +89,6 @@ def add_shrink( ) else: bgmv_shrink(x, w_t_all, y, lora_indices_tensor, scale) -======= - _check_punica_support() - - ops.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) ->>>>>>> main def add_expand( @@ -129,7 +105,6 @@ def add_expand( y+=x@w_t_all When `is_prefilling` is True, will launch `sgmv_expand`, """ -<<<<<<< HEAD if is_prefilling: ( b_seq_start_tensor, @@ -151,21 +126,6 @@ def add_expand( ) else: bgmv_expand(x, w_t_all, y, lora_indices_tensor, add_inputs=add_input) -======= - _check_punica_support() - - ops.dispatch_bgmv_low_level( - y, - x, - w_t_all, - indicies, - layer_idx, - scale, - x.size(1), - y_slice_size, - y_offset, - ) ->>>>>>> main def add_expand_slice( @@ -183,7 +143,6 @@ def add_expand_slice( """ y+=x@w_t_all """ -<<<<<<< HEAD if is_prefilling: ( b_seq_start_tensor, @@ -216,36 +175,6 @@ def add_expand_slice( add_inputs=add_input, ) -======= - _check_punica_support() - - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - ops.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) - ops.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, scale) - - -def add_lora_slice(y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, - *, - buffer: Optional[torch.Tensor] = None): - """ - Same as `add_lora` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. ->>>>>>> main def add_lora( y: torch.Tensor, @@ -285,10 +214,6 @@ def add_lora( buffer (Optional[torch.Tensor], optional): Defaults to None. cache_clear (bool, optional): Defaults to False. 
""" -<<<<<<< HEAD -======= - _check_punica_support() ->>>>>>> main r = wb_t_all.size(-1) if buffer is None: @@ -297,27 +222,13 @@ def add_lora( buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) -<<<<<<< HEAD add_shrink( -======= - ops.dispatch_bgmv_low_level( ->>>>>>> main buffer, x, wa_t_all, lora_indices_tensor, 0, -<<<<<<< HEAD -======= - ) - ops.dispatch_bgmv_low_level( - y, - buffer, - wb_t_all, - indicies, - layer_idx, ->>>>>>> main scale, is_prefilling, cache_clear=cache_clear, From 0f71cc4cdb24f6d7f54a62319b06323c3e46aca4 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 21 Jun 2024 21:48:30 +0800 Subject: [PATCH 31/71] delete punica test --- tests/lora/test_punica.py | 257 -------------------------------------- 1 file changed, 257 deletions(-) delete mode 100644 tests/lora/test_punica.py diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py deleted file mode 100644 index 110c9b243507..000000000000 --- a/tests/lora/test_punica.py +++ /dev/null @@ -1,257 +0,0 @@ -# Based on code from https://github.com/punica-ai/punica - -import pytest -import torch - -import vllm.lora.punica as punica - - -def assert_close(a, b): - rtol, atol = { - torch.float16: (5e-3, 5e-3), - torch.bfloat16: (3e-2, 2e-2), - torch.float32: (None, None), - }[a.dtype] - torch.testing.assert_close(a, b, rtol=rtol, atol=atol) - - -def _lora_ref_impl( - y_final: torch.Tensor, - x: torch.Tensor, - wa_T_all: torch.Tensor, - wb_T_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, -): - y_stage_1 = torch.empty( - (x.size(0), wa_T_all.size(-2)), - dtype=torch.float32, - device=x.device, - ) - bs = x.shape[0] - s = torch.tensor(scale, dtype=torch.float32, device=x.device) - for i, lora_idx in zip(range(bs), indicies.cpu().tolist()): - xi = x[i].unsqueeze(0).to(torch.float32) - wa = wa_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32) - if wb_T_all is not None: - wb = wb_T_all[lora_idx, layer_idx].transpose(-1, - -2).to(torch.float32) - - tmp = xi @ wa - y_stage_1[i] = tmp.squeeze(0) - y_final[i] += ((tmp @ wb).squeeze(0) * - s if wb_T_all is not None else y_stage_1[i]) - return y_final, y_stage_1 - - -H1 = H2 = [ - 128, - 256, - 512, - 896, - 1024, - 1152, - 1216, - 1280, - 1536, - 1664, - 2048, - 2240, - 2304, - 2368, - 2432, - 2560, - 2752, - 3072, - 3328, - 3456, - 3584, - 3712, - 4096, - 4480, - 4608, - 4736, - 4864, - 5120, - 5504, - 5632, - 5888, - 6144, - 6400, - 6848, - 6912, - 7168, - 7424, - 8192, - 8960, - 9216, - 9472, - 10240, - 11008, - 11264, - 13824, - 14336, - 14784, - 14848, - 15360, - 18944, - 22016, - 22528, - 24576, - 27392, - 27648, - 29568, - 29696, - 32000, - 32256, - 32512, - 32768, - 33024, - 36864, - 43264, - 49152, - 60544, - 60672, - 64000, - 64256, - 102400, - 102656, - 128000, - 128256, -] -H2 = [64] + H2 -R = [1, 2, 4] -SEED = [0xabcdabcd987] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - - -@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) -@pytest.mark.parametrize("h1", H1) -@pytest.mark.parametrize("r", R) -@pytest.mark.parametrize("seed", SEED) -@torch.inference_mode() -def test_lora_a_extra_shapes(dtype_str, h1, r, seed): - torch.manual_seed(seed) - num_loras = 4 - num_layers = 1 - bs = 32 - dtype = getattr(torch, dtype_str) - device = torch.device("cuda") - - wa_T_all = torch.randn(num_loras, - num_layers, - r, - h1, - dtype=dtype, - device=device) - indices = torch.randint(num_loras, (bs, ), dtype=torch.long, device=device) - - for 
layer_idx in range(num_layers): - x = torch.randn(bs, h1, dtype=dtype, device=device) - y = torch.randn(bs, r, dtype=dtype, device=device) - - y_ref = y.clone() - _lora_ref_impl( - y_ref, - x, - wa_T_all, - None, - indices, - layer_idx, - 1.0, - ) - - y_our = y.clone() - punica.bgmv(y_our, x, wa_T_all, indices, layer_idx, 1.0) - - assert_close(y_ref, y_our) - - -@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) -@pytest.mark.parametrize("h1", H1) -@pytest.mark.parametrize("h2", H2) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_lora_correctness(dtype_str, h1, h2, seed, device): - torch.manual_seed(seed) - num_loras = 4 - num_layers = 1 - r = 8 - bs = 32 - scale = 0.123 - dtype = getattr(torch, dtype_str) - torch.set_default_device(device) - - wa_T_all = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wb_T_all = torch.randn(num_loras, num_layers, h2, r, dtype=dtype) - indices = torch.randint(num_loras, (bs, ), dtype=torch.long) - - for layer_idx in range(num_layers): - x = torch.randn(bs, h1, dtype=dtype) - y = torch.randn(bs, h2, dtype=dtype) - - y_ref = y.clone() - _lora_ref_impl(y_ref, x, wa_T_all, wb_T_all, indices, layer_idx, scale) - - y_our = y.clone() - punica.add_lora(y_our, x, wa_T_all, wb_T_all, indices, layer_idx, - scale) - - assert_close(y_ref, y_our) - - -@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) -@pytest.mark.parametrize("h1", H1) -@pytest.mark.parametrize("h2", H2) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_lora_correctness_slice(dtype_str, h1, h2, seed, device): - if h2 % 3 != 0 or h2 // 3 not in H1: - pytest.skip("h2 must be divisible by 3 and in supported shapes") - torch.manual_seed(seed) - num_loras = 4 - num_layers = 1 - r = 8 - bs = 32 - scale = 0.123 - dtype = getattr(torch, dtype_str) - torch.set_default_device(device) - - wa_T_all_0 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wa_T_all_1 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wa_T_all_2 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype) - wb_T_all_0 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype) - wb_T_all_1 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype) - wb_T_all_2 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype) - - indices = torch.randint(num_loras, (bs, ), dtype=torch.long) - - for layer_idx in range(num_layers): - x = torch.randn(bs, h1, dtype=dtype) - y = torch.randn(bs, h2, dtype=dtype) - s = h2 // 3 - - y_ref = y.clone() - _lora_ref_impl(y_ref[:, :s], x, wa_T_all_0, wb_T_all_0, indices, - layer_idx, scale) - _lora_ref_impl(y_ref[:, s:s * 2], x, wa_T_all_1, wb_T_all_1, indices, - layer_idx, scale) - _lora_ref_impl(y_ref[:, s * 2:], x, wa_T_all_2, wb_T_all_2, indices, - layer_idx, scale) - - y_our = y.clone() - punica.add_lora_slice(y_our, x, wa_T_all_0, wb_T_all_0, indices, - layer_idx, scale, 0, s) - punica.add_lora_slice(y_our, x, wa_T_all_1, wb_T_all_1, indices, - layer_idx, scale, s, s) - punica.add_lora_slice(y_our, x, wa_T_all_2, wb_T_all_2, indices, - layer_idx, scale, s * 2, s) - - assert_close(y_ref[:, :s], y_our[:, :s]) - assert_close(y_ref[:, s:s * 2], y_our[:, s:s * 2]) - assert_close(y_ref[:, s * 2:], y_our[:, s * 2:]) From b36a92e00ba332eb9014c1e396eacc0b0ae418c4 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 25 Jun 2024 15:00:11 +0800 Subject: [PATCH 32/71] fix bug --- vllm/worker/model_runner.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 69ab24a872e4..c88221de127b 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -635,7 +635,7 @@ def _prepare_model_input( if self.lora_config: lora_mapping = LoRAMapping(lora_index_mapping, lora_prompt_mapping, - bool(attn_metadata.prefill_metadata)) + is_prompt) else: lora_mapping = None From 6f06eb8455c46edd48e12bf3da8c9f2308aa6da5 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 26 Jun 2024 16:43:38 +0800 Subject: [PATCH 33/71] optimize unit test --- tests/lora/test_triton_punica.py | 477 ++++++++++------------------- vllm/lora/ops/bgmv_expand.py | 5 - vllm/lora/ops/bgmv_expand_slice.py | 3 +- vllm/lora/ops/bgmv_shrink.py | 5 +- 4 files changed, 169 insertions(+), 321 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index e55f1373aa2a..56df321714a4 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -1,7 +1,8 @@ +import random + import pytest import torch -from vllm._custom_ops import dispatch_bgmv, dispatch_bgmv_low_level from vllm.lora.ops.bgmv_expand import bgmv_expand from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink @@ -9,7 +10,6 @@ from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink -# The current punica kernel supports dimension and adds a dimension of 3424. HIDDEN_SIZES = [ 128, 256, @@ -62,19 +62,14 @@ 128256, ] -_BATCH_SIZE_ALIGNMENT = 8 - -# vllm support batch size -BATCHS = [1, 2, 4] + [_BATCH_SIZE_ALIGNMENT * i for i in range(1, 8)] +BATCHS = [1, 2, 4] + [8 * i for i in range(1, 4)] -NUM_LORA = [1, 4, 8, 16, 32, 64, 128, 256] +NUM_LORA = [1, 4, 8, 16, 32, 64, 128] DTYPES = [torch.float16, torch.bfloat16] -MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] +MAX_RANKS = [8, 16, 32, 64] SCALES = [0.5] -OP_TYPES = ["shrink", "expand"] SEED = [0] CUDA_DEVICES = [f"cuda:{0}"] -NSLICES = [2, 3] def assert_close(a, b): @@ -86,14 +81,6 @@ def assert_close(a, b): torch.testing.assert_close(a, b, rtol=rtol, atol=atol) -@torch.inference_mode() -def _punica_bgmv(out_tensor, inputs, lora_weights, indices, scaling): - layer_idx = 0 - dispatch_bgmv(out_tensor, inputs, lora_weights, indices, layer_idx, - scaling) - return - - def _torch_groupgemm( out_tensor, inputs, @@ -121,11 +108,10 @@ def _torch_groupgemm( return -def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, +def _generate_data(batchs, hidden_size, lora_nums, max_rank, seq_length, dtype, op_type, device): - if max_length == 1: - max_length += 1 - seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) + seq_len_tensor = torch.randint(seq_length, seq_length + 1, + (batchs, )).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, @@ -143,11 +129,8 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, dtype=dtype, device=inputs_tensor.device) # NOTE shrink kernel using torch.float32 as output type - our_out_tensor = torch.zeros( - (total_tokens, max_rank), - dtype=torch.float32, - device=inputs_tensor.device, - ) + our_out_tensor = torch.zeros((total_tokens, max_rank), + dtype=torch.float32).to(device) else: inputs_tensor = torch.rand( (total_tokens, max_rank), @@ -162,11 +145,9 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, 
ref_out_tensor = torch.rand( (total_tokens, hidden_size), dtype=dtype, - device=inputs_tensor.device, - ) + ).to(device) # Ensure the same input. our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint(0, lora_nums - 1 if lora_nums > 1 else 1, (batchs, )).to(device) @@ -175,7 +156,7 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, for b_id in range(batchs): lora_index = lora_indices_tensor[b_id] indices[current_offset:current_offset + - seq_len_tensor[b_id]] = lora_index.item() + seq_len_tensor[b_id]].copy_(lora_index) current_offset += seq_len_tensor[b_id].item() return ( inputs_tensor, @@ -190,164 +171,86 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, max_length, dtype, def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, - max_length, dtype, nslices, device): - if max_length == 1: - max_length += 1 - seq_len_tensor = torch.randint(1, max_length, (batchs, )).to(device) - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), - dim=0, - ).to(device) - total_tokens = seq_len_tensor.sum() - - inputs_tensor = torch.rand( - (total_tokens, max_rank), - dtype=dtype, - ).to(device) - lora_weights_lst = [] - for _ in range(nslices): - lora_weights_lst.append( - torch.rand( - (lora_nums, hidden_size, max_rank), # col-major - dtype=dtype, - ).to(device)) - # expand op needs to complete y+=a@lora_b, so output is - # initinized randomly - ref_out_tensor = torch.rand( - (total_tokens, hidden_size * nslices), - dtype=dtype, - device=inputs_tensor.device, - ) - # Ensure the same input. - our_out_tensor = ref_out_tensor.clone() - - lora_indices_tensor = torch.randint(0, - lora_nums - 1 if lora_nums > 1 else 1, - (batchs, )).to(device) - indices = torch.zeros((total_tokens), dtype=torch.long).to(device) - current_offset = 0 - for b_id in range(batchs): - lora_index = lora_indices_tensor[b_id] - indices[current_offset:current_offset + - seq_len_tensor[b_id]] = lora_index.item() - current_offset += seq_len_tensor[b_id].item() - return ( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) - - -# @pytest.mark.parametrize("batchs", BATCHS) -# @pytest.mark.parametrize("num_loras", NUM_LORA) -# @pytest.mark.parametrize("rank", MAX_RANKS) -# @pytest.mark.parametrize("scaling", SCALES) -# @pytest.mark.parametrize("dtype", DTYPES) -# @pytest.mark.parametrize("op_type", OP_TYPES) -# @pytest.mark.parametrize("seed", SEED) -# @pytest.mark.parametrize("device", CUDA_DEVICES) -# def test_sgmv_torch( -# batchs: int, -# num_loras: int, -# rank: int, -# scaling: float, -# dtype: torch.dtype, -# op_type: str, -# seed: int, -# device: str, -# ): -# torch.manual_seed(seed) -# torch.set_default_device(device) -# if batchs == 0: -# batchs += 1 -# hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) -# hidden_size = HIDDEN_SIZES[hidden_size_index] -# if hidden_size > 100000: -# hidden_size = hidden_size // 4 # avoid OOM -# ( -# inputs_tensor, -# lora_weights, -# our_out_tensor, -# ref_out_tensor, -# b_seq_start_loc, -# lora_indices_tensor, -# seq_len_tensor, -# indices, -# ) = _generate_data( -# batchs, hidden_size, num_loras, rank, 1024, dtype, op_type, device -# ) # The sequence length is restricted to the range [1, 1024]. 
-# max_seq_length = seq_len_tensor.max() -# if isinstance(max_seq_length, tuple): -# max_seq_length = max_seq_length[0].item() -# else: -# max_seq_length = max_seq_length.item() -# if op_type == "shrink": -# sgmv_shrink( -# inputs_tensor, -# lora_weights, -# our_out_tensor, -# b_seq_start_loc, -# seq_len_tensor, -# lora_indices_tensor, -# batchs, -# max_seq_length, -# scaling, -# ) -# else: -# sgmv_expand( -# inputs_tensor, -# lora_weights, -# our_out_tensor, -# b_seq_start_loc, -# seq_len_tensor, -# lora_indices_tensor, -# batchs, -# max_seq_length, -# add_inputs=True, -# ) -# _torch_groupgemm( -# ref_out_tensor, -# inputs_tensor, -# lora_weights, -# lora_indices_tensor, -# seq_len_tensor, -# batchs, -# scaling if op_type == "shrink" else 1.0, -# op_type, -# ) -# if op_type == "shrink": -# ref_out_tensor = ref_out_tensor.to(torch.float32) -# assert_close(our_out_tensor, ref_out_tensor) + seq_length, dtype, nslices, device): + try: + seq_len_tensor = torch.randint(seq_length, seq_length + 1, + (batchs, )).to(device) + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0, + ).to(device) + total_tokens = seq_len_tensor.sum() + inputs_tensor = torch.rand( + (total_tokens, max_rank), + dtype=dtype, + ).to(device) + lora_weights_lst = [] + for _ in range(nslices): + lora_weights_lst.append( + torch.rand( + (lora_nums, hidden_size, max_rank), # col-major + dtype=dtype, + ).to(device)) + # expand op needs to complete y+=a@lora_b, so output is + # initinized randomly + ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices), + dtype=dtype).to(device) + # Ensure the same input. + our_out_tensor = ref_out_tensor.clone() + lora_indices_tensor = torch.randint( + 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs, )) + indices = torch.zeros((total_tokens), dtype=torch.long).to(device) + current_offset = 0 + for b_id in range(batchs): + lora_index = lora_indices_tensor[b_id] + indices[current_offset:current_offset + + seq_len_tensor[b_id]] = lora_index.item() + current_offset += seq_len_tensor[b_id].item() + + lora_indices_tensor = lora_indices_tensor.to(device) + return ( + inputs_tensor, + lora_weights_lst, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) + except Exception as error: + raise error -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) -def test_triton_sgmv_punica_bgmv( - hidden_size, +def test_punica_sgmv( + batchs: int, + num_loras: int, + rank: int, scaling: float, dtype: torch.dtype, op_type: str, seed: int, device: str, ): - # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error - if dtype == torch.float32 or hidden_size == 3424: - return - torch.manual_seed(seed) + random.seed(seed) torch.set_default_device(device) - batchs = 4 # Arbitrary values for testing - rank = 16 # Arbitrary values for testing - seq_len = 128 # Arbitrary values for testing - num_loras = 8 # Arbitrary values for testing + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) + hidden_size = HIDDEN_SIZES[hidden_size_index] + if hidden_size > 100000: + hidden_size = hidden_size // 4 # avoid OOM + seq_length = 128 ( inputs_tensor, lora_weights, @@ -357,9 +260,8 @@ def test_triton_sgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_length, dtype, op_type, device) - max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): max_seq_length = max_seq_length[0].item() @@ -389,13 +291,15 @@ def test_triton_sgmv_punica_bgmv( max_seq_length, add_inputs=True, ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - _punica_bgmv( + _torch_groupgemm( ref_out_tensor, inputs_tensor, - lora_weights_4d, - indices, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batchs, scaling if op_type == "shrink" else 1.0, + op_type, ) if op_type == "shrink": ref_out_tensor = ref_out_tensor.to(torch.float32) @@ -403,31 +307,34 @@ def test_triton_sgmv_punica_bgmv( @pytest.mark.parametrize("batchs", BATCHS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", OP_TYPES) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) -def test_triton_bgmv_punica_bgmv( +def test_punica_bgmv( batchs: int, - hidden_size: int, + num_loras: int, + rank: int, scaling: float, dtype: torch.dtype, op_type: str, seed: int, device: str, ): - # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error - if dtype == torch.float32 or hidden_size == 3424: - return - torch.manual_seed(seed) + random.seed(seed) torch.set_default_device(device) - if batchs == 0: - batchs += 1 - rank = 16 - seq_len = 1 # - num_loras = 8 # Arbitrary values for testing + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) + hidden_size = HIDDEN_SIZES[hidden_size_index] + if hidden_size > 100000: + hidden_size = hidden_size // 4 # avoid OOM + seq_length = 1 ( inputs_tensor, lora_weights, @@ -437,15 +344,14 @@ def test_triton_bgmv_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_len, dtype, + ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_length, dtype, op_type, device) - if op_type == "shrink": bgmv_shrink( inputs_tensor, lora_weights, our_out_tensor, - lora_indices_tensor, + indices, scaling, ) else: @@ -453,16 +359,18 @@ def test_triton_bgmv_punica_bgmv( inputs_tensor, lora_weights, our_out_tensor, - lora_indices_tensor, + indices, add_inputs=True, ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - _punica_bgmv( + _torch_groupgemm( ref_out_tensor, inputs_tensor, - lora_weights_4d, - indices, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batchs, scaling if op_type == "shrink" else 1.0, + op_type, ) if op_type == "shrink": ref_out_tensor = ref_out_tensor.to(torch.float32) @@ -470,27 +378,33 @@ def test_triton_bgmv_punica_bgmv( @pytest.mark.parametrize("batchs", BATCHS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("nslices", NSLICES) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("nslices", [2, 3]) @pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"]) @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sgmv_expand_slice( +def test_punica_expand_nslices( batchs: int, - hidden_size: int, + num_loras: int, + rank: int, nslices: int, - dtype: str, + dtype: torch.dtype, + op_type: str, seed: int, device: str, ): - # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error - if dtype == torch.float32 or hidden_size == 3424: - return - torch.manual_seed(seed) + random.seed(seed) torch.set_default_device(device) - max_rank = 16 - lora_nums = 4 - max_length = 128 + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) + hidden_size = HIDDEN_SIZES[hidden_size_index] + if hidden_size > 100000: + hidden_size = hidden_size // 4 # avoid OOM + seq_length = 128 if op_type == "sgmv" else 1 ( inputs_tensor, lora_weights_lst, @@ -503,9 +417,9 @@ def test_sgmv_expand_slice( ) = _generate_data_expand_nslices( batchs, hidden_size, - lora_nums, - max_rank, - max_length, + num_loras, + rank, + seq_length, dtype, nslices, device, @@ -518,109 +432,48 @@ def test_sgmv_expand_slice( slice_offset = 0 for index in range(nslices): lora_weights = lora_weights_lst[index] - sgmv_expand_slice( + if op_type == "sgmv": + sgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batchs, + max_seq_length, + slice_offset, + hidden_size, + add_inputs=True, + ) + else: + bgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) + _torch_groupgemm( + ref_outputs[:, slice_offset:slice_offset + hidden_size], inputs_tensor, lora_weights, - our_outputs, - b_seq_start_loc, - seq_len_tensor, lora_indices_tensor, + seq_len_tensor, batchs, - max_seq_length, - slice_offset, - hidden_size, - add_inputs=True, - ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - dispatch_bgmv_low_level( - ref_outputs, - inputs_tensor, - lora_weights_4d, - indices, - 0, 1.0, - inputs_tensor.size(1), - hidden_size, - slice_offset, + op_type="expand", ) - slice_offset += hidden_size - assert_close(our_outputs, ref_outputs) - -@pytest.mark.parametrize("batchs", BATCHS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("nslices", NSLICES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_bgmv_expand_slice( - batchs: int, - hidden_size: int, - nslices: int, - dtype: str, - seed: int, - device: str, -): - # avoid `No suitable kernel. 
h_in=xx h_out=xxxx ` error - if dtype == torch.float32 or hidden_size == 3424: - return - torch.manual_seed(seed) - torch.set_default_device(device) - max_rank = 64 - lora_nums = 8 - ( - inputs_tensor, - lora_weights_lst, - our_outputs, - ref_outputs, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = _generate_data_expand_nslices( - batchs, - hidden_size, - lora_nums, - max_rank, - 1, - dtype, - nslices, - device, - ) - slice_offset = 0 - for index in range(nslices): - lora_weights = lora_weights_lst[index] - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - lora_indices_tensor, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) - lora_weights_4d = lora_weights.unsqueeze(dim=1) - dispatch_bgmv_low_level( - ref_outputs, - inputs_tensor, - lora_weights_4d, - lora_indices_tensor, - 0, - 1.0, - inputs_tensor.size(1), - hidden_size, - slice_offset, - ) slice_offset += hidden_size assert_close(our_outputs, ref_outputs) if __name__ == "__main__": - test_bgmv_expand_slice( - batchs=32, - hidden_size=128, - nslices=2, - dtype=torch.bfloat16, - seed=0, - device="cuda:0", - ) + # cuda:0-0-bgmv-dtype1-3-32-16-24 + for _ in range(1000): + test_punica_expand_nslices(24, 16, 32, 3, torch.bfloat16, "bgmv", 0, + "cuda:0") + print("ssss") diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index ec68c6d20f98..998095c412e6 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -124,15 +124,11 @@ def bgmv_expand( lora_b_weights = lora_b_weights.squeeze(dim=1) else: assert lora_b_weights.ndim == 3 # shape:(lora_num,size,rank) - assert lora_b_weights.is_contiguous() # TODO tuning this config - N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - # BLOCK_N =64 BLOCK_K = triton.next_power_of_2(K) - # SPLIT_N = 8 EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False @@ -142,7 +138,6 @@ def bgmv_expand( ]: CAST_TYPE = True batchs = lora_indices_tensor.size(0) - if override_config: config = override_config else: diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index af343d6eae1c..071dbe40f216 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -110,7 +110,7 @@ def bgmv_expand_slice( slice_size (int): current output_tensor's size batchs (int): batch size add_inputs (bool, optional): Defaults to False. - override_config (Optional[Dict[str, int]], optional): Defaults to None. + override_config (Optional[Dict[str, int]], optional): Defaults to None. 
Triton grid config """ @@ -138,6 +138,7 @@ def bgmv_expand_slice( N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size # BLOCK_N = 256 BLOCK_K = triton.next_power_of_2(K) + # SPLIT_N = 64 EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index 6b92ed72c4c2..3258a60d2455 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -48,7 +48,6 @@ def _bgmv_shrink_kernel( offset_k = tl.arange(0, BLOCK_K) + pid_sk * BLOCK_K a_ptr = input_ptr + cur_batch * xm_stride b_ptr = lora_ptr + l0_stride * lora_index - rank_mask = offset_n[:, None] < N accumulator = tl.zeros((BLOCK_N, ), dtype=tl.float32) for k in range(0, K, BLOCK_K * SPLIT_K): current_k = k + offset_k @@ -59,7 +58,7 @@ def _bgmv_shrink_kernel( mask=current_k < K, other=0.0, ) # [BLOCK_K] - b_ptr_mask = (rank_mask < N) & (current_k[None, :] < K) + b_ptr_mask = (offset_n[:, None] < N) & (current_k[None, :] < K) tiled_b = tl.load( b_ptr + offset_n[:, None] * lora_k_stride + @@ -119,7 +118,7 @@ def bgmv_shrink( # TODO tuning this config batchs = lora_indices_tensor.size(0) N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank - BLOCK_N = triton.next_power_of_2(output_tensor.size(1)) + BLOCK_N = triton.next_power_of_2(N) if override_config: config = override_config else: From 0e7dde342be124a8367377b1a3007f5cf0a35480 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 26 Jun 2024 23:44:03 +0800 Subject: [PATCH 34/71] verify mem --- tests/lora/test_triton_punica.py | 8 -------- vllm/lora/models.py | 1 + vllm/lora/punica.py | 1 + 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 56df321714a4..a5ccf847afe7 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -469,11 +469,3 @@ def test_punica_expand_nslices( slice_offset += hidden_size assert_close(our_outputs, ref_outputs) - - -if __name__ == "__main__": - # cuda:0-0-bgmv-dtype1-3-32-16-24 - for _ in range(1000): - test_punica_expand_nslices(24, 16, 32, 3, torch.bfloat16, "bgmv", 0, - "cuda:0") - print("ssss") diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 5010ac94d643..10ea1e69ce8a 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -615,6 +615,7 @@ def remove_all_loras(self): self._registered_loras.clear() self.lora_index_to_id = [None] * self.lora_slots self._active_loras.clear() + punica.reset_params_cache() def _create_lora_modules(self): for module_name, module in self.model.named_modules( diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 321fccc9df93..e0b441e1dd08 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -44,6 +44,7 @@ def reset_params_cache(): cache explicitly """ #TODO release gpu memory + torch.cuda.empty_cache() _PARAMS_CACHE.clear() From 7419d19f457826ba52bb3582cb9ee18d31d2fccd Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 27 Jun 2024 01:07:41 +0800 Subject: [PATCH 35/71] Trigger CI From 5fbb2a84c7beaba6a854ec2589b454b98faa64d0 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Sat, 29 Jun 2024 13:42:32 +0800 Subject: [PATCH 36/71] update --- tests/lora/test_triton_punica.py | 58 +++++++++++++++--------------- vllm/lora/layers.py | 41 ++++++--------------- vllm/lora/models.py | 7 ++-- vllm/lora/ops/bgmv_expand.py | 8 ++--- vllm/lora/ops/bgmv_expand_slice.py | 8 ++--- vllm/lora/ops/bgmv_shrink.py | 8 ++--- vllm/lora/ops/sgmv_expand.py | 10 +++--- vllm/lora/ops/sgmv_expand_slice.py | 10 +++--- 
vllm/lora/ops/sgmv_shrink.py | 10 +++--- vllm/lora/ops/utils.py | 6 ++-- vllm/lora/punica.py | 2 +- 11 files changed, 73 insertions(+), 95 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index a5ccf847afe7..eea190b153a1 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -62,7 +62,7 @@ 128256, ] -BATCHS = [1, 2, 4] + [8 * i for i in range(1, 4)] +batches = [1, 2, 4] + [8 * i for i in range(1, 4)] NUM_LORA = [1, 4, 8, 16, 32, 64, 128] DTYPES = [torch.float16, torch.bfloat16] @@ -87,13 +87,13 @@ def _torch_groupgemm( lora_weights, lora_indices_tensor, seq_len_tensor, - batchs, + batches, scaling, op_type, ) -> torch.Tensor: out_list = [] current_offset = 0 - for lora_index, b_length in zip(range(batchs), seq_len_tensor): + for lora_index, b_length in zip(range(batches), seq_len_tensor): input_weight = inputs[current_offset:b_length + current_offset, :] current_offset += b_length lora_weight = lora_weights[lora_indices_tensor[lora_index]] @@ -108,10 +108,10 @@ def _torch_groupgemm( return -def _generate_data(batchs, hidden_size, lora_nums, max_rank, seq_length, dtype, - op_type, device): +def _generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, + dtype, op_type, device): seq_len_tensor = torch.randint(seq_length, seq_length + 1, - (batchs, )).to(device) + (batches, )).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, @@ -150,10 +150,10 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, seq_length, dtype, our_out_tensor = ref_out_tensor.clone() lora_indices_tensor = torch.randint(0, lora_nums - 1 if lora_nums > 1 else 1, - (batchs, )).to(device) + (batches, )).to(device) indices = torch.zeros((total_tokens), dtype=torch.long).to(device) current_offset = 0 - for b_id in range(batchs): + for b_id in range(batches): lora_index = lora_indices_tensor[b_id] indices[current_offset:current_offset + seq_len_tensor[b_id]].copy_(lora_index) @@ -170,11 +170,11 @@ def _generate_data(batchs, hidden_size, lora_nums, max_rank, seq_length, dtype, ) -def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, +def _generate_data_expand_nslices(batches, hidden_size, lora_nums, max_rank, seq_length, dtype, nslices, device): try: seq_len_tensor = torch.randint(seq_length, seq_length + 1, - (batchs, )).to(device) + (batches, )).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, @@ -198,10 +198,10 @@ def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, # Ensure the same input. 
our_out_tensor = ref_out_tensor.clone() lora_indices_tensor = torch.randint( - 0, lora_nums - 1 if lora_nums > 1 else 1, (batchs, )) + 0, lora_nums - 1 if lora_nums > 1 else 1, (batches, )) indices = torch.zeros((total_tokens), dtype=torch.long).to(device) current_offset = 0 - for b_id in range(batchs): + for b_id in range(batches): lora_index = lora_indices_tensor[b_id] indices[current_offset:current_offset + seq_len_tensor[b_id]] = lora_index.item() @@ -222,7 +222,7 @@ def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, raise error -@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("batches", batches) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("scaling", SCALES) @@ -231,7 +231,7 @@ def _generate_data_expand_nslices(batchs, hidden_size, lora_nums, max_rank, @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_punica_sgmv( - batchs: int, + batches: int, num_loras: int, rank: int, scaling: float, @@ -260,8 +260,8 @@ def test_punica_sgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_length, dtype, - op_type, device) + ) = _generate_data(batches, hidden_size, num_loras, rank, seq_length, + dtype, op_type, device) max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): max_seq_length = max_seq_length[0].item() @@ -275,7 +275,7 @@ def test_punica_sgmv( b_seq_start_loc, seq_len_tensor, lora_indices_tensor, - batchs, + batches, max_seq_length, scaling, ) @@ -287,7 +287,7 @@ def test_punica_sgmv( b_seq_start_loc, seq_len_tensor, lora_indices_tensor, - batchs, + batches, max_seq_length, add_inputs=True, ) @@ -297,7 +297,7 @@ def test_punica_sgmv( lora_weights, lora_indices_tensor, seq_len_tensor, - batchs, + batches, scaling if op_type == "shrink" else 1.0, op_type, ) @@ -306,7 +306,7 @@ def test_punica_sgmv( assert_close(our_out_tensor, ref_out_tensor) -@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("batches", batches) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("scaling", SCALES) @@ -315,7 +315,7 @@ def test_punica_sgmv( @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_punica_bgmv( - batchs: int, + batches: int, num_loras: int, rank: int, scaling: float, @@ -344,8 +344,8 @@ def test_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batchs, hidden_size, num_loras, rank, seq_length, dtype, - op_type, device) + ) = _generate_data(batches, hidden_size, num_loras, rank, seq_length, + dtype, op_type, device) if op_type == "shrink": bgmv_shrink( inputs_tensor, @@ -368,7 +368,7 @@ def test_punica_bgmv( lora_weights, lora_indices_tensor, seq_len_tensor, - batchs, + batches, scaling if op_type == "shrink" else 1.0, op_type, ) @@ -377,7 +377,7 @@ def test_punica_bgmv( assert_close(our_out_tensor, ref_out_tensor) -@pytest.mark.parametrize("batchs", BATCHS) +@pytest.mark.parametrize("batches", batches) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("nslices", [2, 3]) @@ -386,7 +386,7 @@ def test_punica_bgmv( @pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_punica_expand_nslices( - batchs: int, + batches: int, num_loras: int, rank: int, nslices: int, @@ -415,7 +415,7 @@ def 
test_punica_expand_nslices( seq_len_tensor, indices, ) = _generate_data_expand_nslices( - batchs, + batches, hidden_size, num_loras, rank, @@ -440,7 +440,7 @@ def test_punica_expand_nslices( b_seq_start_loc, seq_len_tensor, lora_indices_tensor, - batchs, + batches, max_seq_length, slice_offset, hidden_size, @@ -462,7 +462,7 @@ def test_punica_expand_nslices( lora_weights, lora_indices_tensor, seq_len_tensor, - batchs, + batches, 1.0, op_type="expand", ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 5a612788f4a6..0fe87d1ff2e8 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -63,31 +63,6 @@ def dec(*args, **kwargs): return dec -def _apply_expand( - x: torch.Tensor, - lora_b_stacked: torch.Tensor, - lora_index_tensor: torch.Tensor, - indices_info: List[int], - output: torch.Tensor, - add_input: bool = True, -) -> torch.Tensor: - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - token_num = indices_info[0] - is_prefilling = bool(indices_info[5]) - add_expand( - output, - x, - lora_b_stacked, - lora_index_tensor[:token_num], - 0, - is_prefilling, - add_input, - ) - return output.view_as(org_output) - - def _apply_lora(x: torch.Tensor, lora_a_stacked: torch.Tensor, lora_b_stacked: torch.Tensor, @@ -118,9 +93,7 @@ def _apply_lora(x: torch.Tensor, org_output = output x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) - token_num = indices_info[0] - is_prefilling = bool(indices_info[5]) add_lora(output, x, @@ -386,12 +359,18 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[1], -1, ) - _apply_expand( + full_lora_a_embeddings = full_lora_a_embeddings.view( + -1, full_lora_a_embeddings.shape[-1]) + full_output = full_output.view(-1, full_output.shape[-1]) + token_num = self.indices_len[0] + is_prefilling = bool(self.indices_len[5]) + add_expand( + full_output, full_lora_a_embeddings, self.lora_b_stacked, - self.indices, - self.indices_len, - full_output, + self.indices[:token_num], + 0, + is_prefilling, add_input=True, ) return full_output.view_as(full_output_org) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 32333f05b09d..24d95d6bb1b7 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -79,9 +79,9 @@ def convert_mapping( requests to RoPE offsets and rot dims for long LoRAs. None if long context lora doesn't exist. indices_len: List of lengths of the above tensors and prefilling - flag.Used to index into each tensor. It contains - (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices,prefilling flag). + flag. Used to index into each tensor. It contains + (base_indices, sampler_indices, sampler_indices_padded, + embeddings_indices, long_lora_indices, prefilling flag). """ index_mapping_indices: List[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() @@ -448,7 +448,6 @@ def __init__( # Dict instead of a Set for compatibility with LRUCache. self._active_loras: Dict[int, None] = {} self._last_mapping: Optional[LoRAMapping] = None - self._convert_flag = True self._create_lora_modules() @property diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 998095c412e6..576559beeffe 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -102,7 +102,7 @@ def bgmv_expand( output_tensor (torch.Tensor): output tensor lora_indices_tensor (torch.Tensor): (batch_size,). 
The LoRA index corresponding to each batch - batchs (int): batch size + batches (int): batch size add_inputs (bool, optional): Defaults to False. adds the final lora results to the output. override_config (Optional[Dict[str, int]], optional): Defaults to None. @@ -137,14 +137,14 @@ def bgmv_expand( torch.bfloat16, ]: CAST_TYPE = True - batchs = lora_indices_tensor.size(0) + batches = lora_indices_tensor.size(0) if override_config: config = override_config else: - config = get_lora_op_configs("expand", batchs, N) + config = get_lora_op_configs("expand", batches, N) grid = lambda META: ( META["SPLIT_N"], - batchs, + batches, ) _bgmv_expand_kernel[grid]( inputs, diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 071dbe40f216..24f2b93f4bf2 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -108,7 +108,7 @@ def bgmv_expand_slice( corresponding to each batch slice_offst (int): output_tensor's offst slice_size (int): current output_tensor's size - batchs (int): batch size + batches (int): batch size add_inputs (bool, optional): Defaults to False. override_config (Optional[Dict[str, int]], optional): Defaults to None. Triton grid config @@ -149,16 +149,16 @@ def bgmv_expand_slice( ]: CAST_TYPE = True - batchs = lora_indices_tensor.size(0) + batches = lora_indices_tensor.size(0) if override_config: config = override_config else: - config = get_lora_op_configs("expand", batchs, N) + config = get_lora_op_configs("expand", batches, N) grid = lambda META: ( META["SPLIT_N"], - batchs, + batches, ) _bgmv_expand_slice_kernel[grid]( inputs, diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index 3258a60d2455..85c36fd9ce04 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -94,7 +94,7 @@ def bgmv_shrink( output_tensor (torch.Tensor): output tensor lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch - batchs (int): batch size + batches (int): batch size scaling (float): Scaling factor. override_config (Optional[Dict[str, int]], optional): Defaults to None. Triton grid config @@ -116,18 +116,18 @@ def bgmv_shrink( assert lora_a_weights.is_contiguous() assert output_tensor.is_contiguous() # TODO tuning this config - batchs = lora_indices_tensor.size(0) + batches = lora_indices_tensor.size(0) N, K = lora_a_weights.shape[-2:] # K=hidden_size,N=rank BLOCK_N = triton.next_power_of_2(N) if override_config: config = override_config else: # First try to load optimal config from the file - config = get_lora_op_configs("shrink", batchs, K) + config = get_lora_op_configs("shrink", batches, K) grid = lambda META: ( META["SPLIT_K"], - batchs, + batches, ) _bgmv_shrink_kernel[grid]( inputs, diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 879184db0b8b..f3a53b70f415 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -102,7 +102,7 @@ def sgmv_expand( b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, lora_indices_tensor: torch.Tensor, - batchs: int, + batches: int, max_seq_length: int, add_inputs: bool = False, ): @@ -119,7 +119,7 @@ def sgmv_expand( length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch - batchs (int): batch size + batches (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch add_inputs (bool, optional): Defaults to False. 
adds the final lora @@ -132,8 +132,8 @@ def sgmv_expand( torch.bfloat16, ] assert inputs.size(1) == lora_b_weights.size(-1) - assert b_seq_start_loc.size(0) == batchs - assert lora_indices_tensor.size(0) == batchs + assert b_seq_start_loc.size(0) == batches + assert lora_indices_tensor.size(0) == batches assert inputs.is_contiguous() assert output_tensor.is_contiguous() @@ -161,7 +161,7 @@ def sgmv_expand( CAST_TYPE = True grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), - batchs, + batches, ] _sgmv_expand_kernel[grid]( inputs, diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 000fef304823..52c71c5095b5 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -108,7 +108,7 @@ def sgmv_expand_slice( b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, lora_indices_tensor: torch.Tensor, - batchs: int, + batches: int, max_seq_length: int, slice_offset: int, slice_size: int, @@ -128,7 +128,7 @@ def sgmv_expand_slice( length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch - batchs (int): batch size + batches (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch slice_offst (int): output_tensor's offst @@ -143,8 +143,8 @@ def sgmv_expand_slice( torch.bfloat16, ] assert inputs.size(1) == lora_b_weights.size(-1) - assert b_seq_start_loc.size(0) == batchs - assert lora_indices_tensor.size(0) == batchs + assert b_seq_start_loc.size(0) == batches + assert lora_indices_tensor.size(0) == batches assert slice_size == lora_b_weights.size(-2) assert inputs.is_contiguous() assert output_tensor.is_contiguous() @@ -173,7 +173,7 @@ def sgmv_expand_slice( CAST_TYPE = True grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), - batchs, + batches, ] _sgmv_expand_slice_kernel[grid]( inputs, diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 45aeb9e9fb78..f295f0118f0b 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -105,7 +105,7 @@ def sgmv_shrink( b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, lora_indices_tensor: torch.Tensor, - batchs: int, + batches: int, max_seq_length: int, scaling: float, ): @@ -123,7 +123,7 @@ def sgmv_shrink( length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index corresponding to each batch - batchs (int): batch size + batches (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch scaling (float): Scaling factor. 
@@ -135,8 +135,8 @@ def sgmv_shrink( torch.bfloat16, ] assert inputs.size(1) == lora_a_weights.size(-1) - assert b_seq_start_loc.size(0) == batchs - assert lora_indices_tensor.size(0) == batchs + assert b_seq_start_loc.size(0) == batches + assert lora_indices_tensor.size(0) == batches assert inputs.is_contiguous() if lora_a_weights.ndim == 4: # shape:(lora_num,1,rank, size) @@ -156,7 +156,7 @@ def sgmv_shrink( grid = [ triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), SPLIT_K, - batchs, + batches, ] _sgmv_shrink_kernel[grid]( diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py index 980dc8c6693f..78ba7c170353 100644 --- a/vllm/lora/ops/utils.py +++ b/vllm/lora/ops/utils.py @@ -8,11 +8,11 @@ def _get_config_file_name( op_type: str, - batchs: int, + batches: int, hidden_size: int, ) -> str: device_name = torch.cuda.get_device_name().replace(" ", "_") - return (f"op_type={op_type},batchs={batchs},hidden_size={hidden_size} " + + return (f"op_type={op_type},batches={batches},hidden_size={hidden_size} " + f"device_name={device_name}.json") @@ -29,7 +29,7 @@ def _get_op_configs(op_type: str, batch: int, hidden_size: int): if os.path.exists(config_file_path): with open(config_file_path) as f: tuned_config = json.load(f).get( - f"batchs={batch},hidden_size={hidden_size}", None) + f"batches={batch},hidden_size={hidden_size}", None) return tuned_config # If no optimized configuration is available, return None diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index e0b441e1dd08..aa96ba5f9240 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -44,8 +44,8 @@ def reset_params_cache(): cache explicitly """ #TODO release gpu memory - torch.cuda.empty_cache() _PARAMS_CACHE.clear() + torch.cuda.empty_cache() def _get_prefilling_params(token_lora_tensor: torch.Tensor, From 7eebe1c8514a1e765279d85c148cfcc235364733 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 1 Jul 2024 14:59:25 +0800 Subject: [PATCH 37/71] update docs --- vllm/lora/layers.py | 6 +++--- vllm/lora/ops/bgmv_shrink.py | 2 +- vllm/lora/ops/utils.py | 38 ++++++++---------------------------- 3 files changed, 12 insertions(+), 34 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 0fe87d1ff2e8..9ae7050157fe 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -359,9 +359,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[1], -1, ) - full_lora_a_embeddings = full_lora_a_embeddings.view( - -1, full_lora_a_embeddings.shape[-1]) - full_output = full_output.view(-1, full_output.shape[-1]) + # full_lora_a_embeddings = full_lora_a_embeddings.view( + # -1, full_lora_a_embeddings.shape[-1]) + # full_output = full_output.view(-1, full_output.shape[-1]) token_num = self.indices_len[0] is_prefilling = bool(self.indices_len[5]) add_expand( diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index 85c36fd9ce04..6e3d90e2d235 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -123,7 +123,7 @@ def bgmv_shrink( config = override_config else: # First try to load optimal config from the file - config = get_lora_op_configs("shrink", batches, K) + config = get_lora_op_configs("bgmv_shrink", batches, K) grid = lambda META: ( META["SPLIT_K"], diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py index 78ba7c170353..1755ac92b0d6 100644 --- a/vllm/lora/ops/utils.py +++ b/vllm/lora/ops/utils.py @@ -1,39 +1,10 @@ import functools -import json -import os from typing import Dict -import torch - - -def 
_get_config_file_name( - op_type: str, - batches: int, - hidden_size: int, -) -> str: - device_name = torch.cuda.get_device_name().replace(" ", "_") - return (f"op_type={op_type},batches={batches},hidden_size={hidden_size} " + - f"device_name={device_name}.json") - @functools.lru_cache def _get_op_configs(op_type: str, batch: int, hidden_size: int): - FOLDER_NAME = "bgmv_configs" - json_file_name = _get_config_file_name(op_type, batch, hidden_size) - - config_file_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - FOLDER_NAME, - json_file_name, - ) - if os.path.exists(config_file_path): - with open(config_file_path) as f: - tuned_config = json.load(f).get( - f"batches={batch},hidden_size={hidden_size}", None) - return tuned_config - - # If no optimized configuration is available, return None - return None + raise NotImplementedError def _get_default_config(op_type: str, batch: int, hidden_size: int): @@ -45,6 +16,13 @@ def _get_default_config(op_type: str, batch: int, hidden_size: int): def get_lora_op_configs(op_type: str, batch: int, hidden_size: int) -> Dict[str, int]: + """Inspired by `fused_moe_kernel` + The return value will be a dictionary mapping an irregular grid of batch + sizes and hidden_size to configurations of the bgmv-related kernel. + NOTE: It currently only supports the default configuration. We plan to + generate optimal configurations for different hardware in the future using + scripts similar to `benchmark_moe.py`. + """ config = _get_op_configs(op_type, batch, hidden_size) if not config: config = _get_default_config(op_type, batch, hidden_size) From 8ac0331537bd096454f4b79398dc35a58913cf9d Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 1 Jul 2024 15:08:16 +0800 Subject: [PATCH 38/71] update docs --- vllm/lora/punica.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index aa96ba5f9240..bf1c8b5a9a6c 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -43,7 +43,6 @@ def reset_params_cache(): """At the beginning of the prefilling stage, we need clear the cache explicitly """ - #TODO release gpu memory _PARAMS_CACHE.clear() torch.cuda.empty_cache() From ea4b3cdff320883091c1cc914697932515a563e4 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 1 Jul 2024 15:49:10 +0800 Subject: [PATCH 39/71] update docs --- vllm/lora/punica.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index bf1c8b5a9a6c..c023ebc51eb6 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -18,7 +18,12 @@ def _compute_params( token_lora_tensor: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: """ - Get the information required for the sgmv kernel. + Get the information required for the sgmv kernel. With the features: + 1. If consecutive requests in the batch use the same LoRA, this function + will combine them into a single request, improving sgmv kernel inference + performance. + 2. At the beginning of each prefilling stage inference, recalculations are + needed based on the input, but only once. 
""" pointer = token_lora_tensor.data_ptr() if pointer not in _PARAMS_CACHE: From 4a13f27396bfbf071a1e9858fd845a2d1ac98486 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 1 Jul 2024 17:07:53 +0800 Subject: [PATCH 40/71] fix bug --- vllm/lora/ops/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py index 1755ac92b0d6..6a637288f71e 100644 --- a/vllm/lora/ops/utils.py +++ b/vllm/lora/ops/utils.py @@ -4,7 +4,8 @@ @functools.lru_cache def _get_op_configs(op_type: str, batch: int, hidden_size: int): - raise NotImplementedError + # TODO: add optimal configurations + return None def _get_default_config(op_type: str, batch: int, hidden_size: int): From a10f8bc548fa792852cb570b997f53e87cfb9af2 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 1 Jul 2024 22:26:25 +0800 Subject: [PATCH 41/71] reformat --- vllm/lora/punica.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 65afca18a850..c023ebc51eb6 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -13,6 +13,7 @@ _PARAMS_CACHE: Dict[int, Tuple] = {} + def _compute_params( token_lora_tensor: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: From 3fb601676cf7d5eab6c2920c47473d88b050ecf9 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 2 Jul 2024 16:24:06 +0800 Subject: [PATCH 42/71] test lazy import --- vllm/lora/punica.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index c023ebc51eb6..a9b3040674d1 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -4,13 +4,6 @@ import torch -from vllm.lora.ops.bgmv_expand import bgmv_expand -from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice -from vllm.lora.ops.bgmv_shrink import bgmv_shrink -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice -from vllm.lora.ops.sgmv_shrink import sgmv_shrink - _PARAMS_CACHE: Dict[int, Tuple] = {} @@ -73,6 +66,9 @@ def add_shrink( y=x@w_t_all When `is_prefilling` is True, will launch `sgmv_shrink` """ + from vllm.lora.ops.bgmv_shrink import bgmv_shrink + from vllm.lora.ops.sgmv_shrink import sgmv_shrink + if is_prefilling: ( b_seq_start_tensor, @@ -110,6 +106,8 @@ def add_expand( y+=x@w_t_all When `is_prefilling` is True, will launch `sgmv_expand`, """ + from vllm.lora.ops.bgmv_expand import bgmv_expand + from vllm.lora.ops.sgmv_expand import sgmv_expand if is_prefilling: ( b_seq_start_tensor, @@ -148,6 +146,8 @@ def add_expand_slice( """ y+=x@w_t_all """ + from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice + from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice if is_prefilling: ( b_seq_start_tensor, From e49a5dc136c8ac2d6368ec1c77964acd1e9d5558 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 3 Jul 2024 23:27:11 +0800 Subject: [PATCH 43/71] merge --- vllm/lora/punica.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 2dd464552dfb..c023ebc51eb6 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -4,22 +4,16 @@ import torch -<<<<<<< HEAD from vllm.lora.ops.bgmv_expand import bgmv_expand from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.bgmv_shrink import bgmv_shrink from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import 
sgmv_shrink -======= -from vllm import _custom_ops as ops -from vllm.platforms import current_platform ->>>>>>> origin/main _PARAMS_CACHE: Dict[int, Tuple] = {} -<<<<<<< HEAD def _compute_params( token_lora_tensor: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: @@ -48,16 +42,6 @@ def _compute_params( max_length, ) return _PARAMS_CACHE[pointer] -======= - if current_platform.get_device_capability() < (8, 0): - raise ImportError( - "punica LoRA kernels require compute capability >= 8.0") - else: - raise ImportError( - "punica LoRA kernels could not be imported. If you built vLLM " - "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " - "was set.") ->>>>>>> origin/main def reset_params_cache(): From 66dd88f41c7caf5ad940ed6017701b345a2a79e4 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 3 Jul 2024 23:34:53 +0800 Subject: [PATCH 44/71] merge main --- vllm/lora/punica.py | 14 +++++++------- vllm/worker/model_runner.py | 7 ++----- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index a9b3040674d1..c023ebc51eb6 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -4,6 +4,13 @@ import torch +from vllm.lora.ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice +from vllm.lora.ops.bgmv_shrink import bgmv_shrink +from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice +from vllm.lora.ops.sgmv_shrink import sgmv_shrink + _PARAMS_CACHE: Dict[int, Tuple] = {} @@ -66,9 +73,6 @@ def add_shrink( y=x@w_t_all When `is_prefilling` is True, will launch `sgmv_shrink` """ - from vllm.lora.ops.bgmv_shrink import bgmv_shrink - from vllm.lora.ops.sgmv_shrink import sgmv_shrink - if is_prefilling: ( b_seq_start_tensor, @@ -106,8 +110,6 @@ def add_expand( y+=x@w_t_all When `is_prefilling` is True, will launch `sgmv_expand`, """ - from vllm.lora.ops.bgmv_expand import bgmv_expand - from vllm.lora.ops.sgmv_expand import sgmv_expand if is_prefilling: ( b_seq_start_tensor, @@ -146,8 +148,6 @@ def add_expand_slice( """ y+=x@w_t_all """ - from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice - from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice if is_prefilling: ( b_seq_start_tensor, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 16fc21ef3207..de550d472082 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1032,11 +1032,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: ) if self.lora_config: - lora_mapping = LoRAMapping( - [0] * batch_size, - [0] * batch_size, - False - ) + lora_mapping = LoRAMapping([0] * batch_size, + [0] * batch_size, False) self.set_active_loras(set(), lora_mapping) graph_runner = CUDAGraphRunner( From 0cedeb34e533ba2e936fb607d433eed12c0bd3d0 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 4 Jul 2024 16:51:56 +0800 Subject: [PATCH 45/71] modify punica --- tests/lora/test_triton_punica.py | 11 +- vllm/lora/fully_sharded_layers.py | 24 +-- vllm/lora/layers.py | 25 ++- vllm/lora/models.py | 12 +- vllm/lora/punica.py | 264 ++++++++++++++++++------------ 5 files changed, 192 insertions(+), 144 deletions(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index eea190b153a1..9bbc529188d8 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -62,11 +62,10 @@ 128256, ] -batches = [1, 2, 4] + [8 * i for i in range(1, 4)] 
- +BATCHES = [1, 2, 4] + [8 * i for i in range(1, 4)] NUM_LORA = [1, 4, 8, 16, 32, 64, 128] DTYPES = [torch.float16, torch.bfloat16] -MAX_RANKS = [8, 16, 32, 64] +MAX_RANKS = [1] SCALES = [0.5] SEED = [0] CUDA_DEVICES = [f"cuda:{0}"] @@ -222,7 +221,7 @@ def _generate_data_expand_nslices(batches, hidden_size, lora_nums, max_rank, raise error -@pytest.mark.parametrize("batches", batches) +@pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("scaling", SCALES) @@ -306,7 +305,7 @@ def test_punica_sgmv( assert_close(our_out_tensor, ref_out_tensor) -@pytest.mark.parametrize("batches", batches) +@pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("scaling", SCALES) @@ -377,7 +376,7 @@ def test_punica_bgmv( assert_close(our_out_tensor, ref_out_tensor) -@pytest.mark.parametrize("batches", batches) +@pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("nslices", [2, 3]) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 3ff69c930247..d21649bed5d6 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -65,7 +65,7 @@ def apply(self, x: torch.Tensor, device=x.device, ) token_num = self.indices_len[0] - is_prefilling = bool(self.indices_len[5]) + is_prefill = bool(self.indices_len[5]) add_shrink( buffer, x, @@ -73,7 +73,7 @@ def apply(self, x: torch.Tensor, self.indices[:token_num], 0, 1.0, - is_prefilling, + is_prefill, ) buffer = tensor_model_parallel_all_gather(buffer) add_expand( @@ -82,7 +82,7 @@ def apply(self, x: torch.Tensor, self.lora_b_stacked, self.indices[:token_num], 0, - is_prefilling, + is_prefill, add_input=True, ) # now have column partitioned output @@ -130,7 +130,7 @@ def _mcp_apply(x, bias, layer): device=x.device, ) token_num = layer.indices_len[0] - is_prefilling = bool(layer.indices_len[5]) + is_prefill = bool(layer.indices_len[5]) for idx in range(n): add_shrink( @@ -140,7 +140,7 @@ def _mcp_apply(x, bias, layer): layer.indices[:token_num], 0, 1.0, - is_prefilling, + is_prefill, ) buffers = tensor_model_parallel_all_gather(buffers) @@ -153,7 +153,7 @@ def _mcp_apply(x, bias, layer): layer.lora_b_stacked[idx], layer.indices[:token_num], 0, - is_prefilling, + is_prefill, left_offset, shard_size, add_input=True, @@ -239,10 +239,10 @@ def apply(self, x: torch.Tensor, device=x.device) token_num = self.indices_len[0] - is_prefilling = bool(self.indices_len[5]) + is_prefill = bool(self.indices_len[5]) add_shrink(buffer, x, self.lora_a_stacked, self.indices[:token_num], 0, - 1.0, is_prefilling) + 1.0, is_prefill) buffer = tensor_model_parallel_all_gather(buffer) add_expand(output, @@ -250,7 +250,7 @@ def apply(self, x: torch.Tensor, self.lora_b_stacked, self.indices[:token_num], 0, - is_prefilling, + is_prefill, add_input=True) # now have column partitioned output @@ -346,7 +346,7 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: device=x.device, ) token_num = self.indices_len[0] - is_prefilling = bool(self.indices_len[5]) + is_prefill = bool(self.indices_len[5]) add_shrink( buffer, x, @@ -354,7 +354,7 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: self.indices[:token_num], 0, 1.0, - is_prefilling, + is_prefill, ) buffer = tensor_model_parallel_all_reduce(buffer) @@ -372,7 +372,7 @@ 
def apply(self, x: torch.Tensor) -> torch.Tensor: self.lora_b_stacked, self.indices[:self.indices_len[0]], 0, - is_prefilling, + is_prefill, start_idx, shard_size, ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 9cb122ee9547..632eb75f9699 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -83,7 +83,7 @@ def _apply_lora(x: torch.Tensor, index corresponding to each token indices_len(List):(6,), It contains (base_indices, sampler_indices, sampler_indices_padded,embeddings_indices, long_lora_indices, - prefilling flag). + prefill flag). output (torch.Tensor): (batch_size, output_dim) Returns: @@ -94,7 +94,7 @@ def _apply_lora(x: torch.Tensor, x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) token_num = indices_info[0] - is_prefilling = bool(indices_info[5]) + is_prefill = bool(indices_info[5]) add_lora(output, x, lora_a_stacked, @@ -102,7 +102,7 @@ def _apply_lora(x: torch.Tensor, lora_index_tensor[:token_num], 0, 1.0, - is_prefilling, + is_prefill, cache_clear=cache_clear) return output.view_as(org_output) @@ -127,7 +127,7 @@ def _apply_lora_packed_nslice(x: torch.Tensor, output = output.view(-1, output.shape[-1]) token_num = indices_info[0] - is_prefilling = bool(indices_info[5]) + is_prefill = bool(indices_info[5]) offset_left = 0 # TODO fuse these kernels for slice_idx in range(len(output_slices)): @@ -138,7 +138,7 @@ def _apply_lora_packed_nslice(x: torch.Tensor, lora_index_tensor[:token_num], 0, 1.0, - is_prefilling, + is_prefill, offset_left, output_slices[slice_idx], cache_clear=cache_clear) @@ -153,8 +153,8 @@ class LoRAMapping: index_mapping: Tuple[int, ...] # Per sampled token: prompt_mapping: Tuple[int, ...] - # prefilling or decoding. - is_prefilling: bool = False + # prefill stage or decode stage. + is_prefill: bool = False def __post_init__(self): self.index_mapping = tuple(self.index_mapping) @@ -363,14 +363,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # -1, full_lora_a_embeddings.shape[-1]) # full_output = full_output.view(-1, full_output.shape[-1]) token_num = self.indices_len[0] - is_prefilling = bool(self.indices_len[5]) + is_prefill = bool(self.indices_len[5]) add_expand( full_output, full_lora_a_embeddings, self.lora_b_stacked, self.indices[:token_num], 0, - is_prefilling, + is_prefill, add_input=True, ) return full_output.view_as(full_output_org) @@ -1297,10 +1297,9 @@ def _get_logits( self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + lora_logits.shape[1], ] = lora_logits - # LogitsProcessorWithLoRA always using bgmv - # sampler_indices sampler_indices = self.indices_len[1] - is_prefilling = False + # LogitsProcessorWithLoRA always using bgmv + is_prefill = False add_lora( logits, hidden_states, @@ -1309,7 +1308,7 @@ def _get_logits( self.indices[:sampler_indices], 0, 1.0, - is_prefilling, + is_prefill, ) # Remove paddings in vocab (if any). logits = logits[:, :self.base_layer.vocab_size] diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 4ef8f6115f0f..5324d50380dc 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -78,10 +78,10 @@ def convert_mapping( long_lora_indices: Tensor of shape [batch_size] mapping requests to RoPE offsets and rot dims for long LoRAs. None if long context lora doesn't exist. - indices_len: List of lengths of the above tensors and prefilling + indices_len: List of lengths of the above tensors and prefill flag. Used to index into each tensor. 
It contains (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices, prefilling flag). + embeddings_indices, long_lora_indices, prefill flag). """ index_mapping_indices: List[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() @@ -149,9 +149,9 @@ def convert_mapping( else: #If long_lora doesn't exist,append None indices_len.append(None) - # Append a prefilling flag to help selecting the appropriate lora + # Append a prefill flag to help selecting the appropriate lora # ops (sgmv or bgmv) - indices_len.append(int(mapping.is_prefilling)) + indices_len.append(int(mapping.is_prefill)) return ( base_indices, sampler_indices, @@ -458,7 +458,7 @@ def __init__( self.scaling_factor_to_offset: Dict[float, int] = {} # 6 is the number of indicies tensors. # base_indices, sampler_indices, sampler_indices_padded, - # embeddings_indices,long_lora_indices,prefilling or decoding + # embeddings_indices,long_lora_indices,prefill or decode stage self.indices_len: List[Optional[int]] = [None] * 6 self.model = model @@ -622,7 +622,7 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: # Maintain the reference self.indices_len[:] = indices_len # - if mapping.is_prefilling: + if mapping.is_prefill: punica.reset_params_cache() punica._compute_params(self.base_indices[:base_indices.shape[0]]) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index c023ebc51eb6..9474744ade2e 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -22,7 +22,7 @@ def _compute_params( 1. If consecutive requests in the batch use the same LoRA, this function will combine them into a single request, improving sgmv kernel inference performance. - 2. At the beginning of each prefilling stage inference, recalculations are + 2. At the beginning of each prefill stage inference, recalculations are needed based on the input, but only once. 
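    For illustration, a minimal sketch (tensor values are made up) of how the
    consecutive merging described in point 1 can be derived with
    `torch.unique_consecutive`:

        import torch

        # Hypothetical per-token LoRA indices for one prefill batch.
        token_lora = torch.tensor([0, 0, 0, 1, 1, 2, 2, 2])

        lora_ids, seq_lens = torch.unique_consecutive(token_lora,
                                                      return_counts=True)
        starts = torch.zeros_like(seq_lens)
        starts[1:] = torch.cumsum(seq_lens, dim=0)[:-1]

        # lora_ids -> tensor([0, 1, 2])   one sgmv "request" per run
        # seq_lens -> tensor([3, 2, 3])
        # starts   -> tensor([0, 3, 5])
        # i.e. batch_size=3 and max_length=3 instead of 8 token-level entries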
""" pointer = token_lora_tensor.data_ptr() @@ -45,20 +45,123 @@ def _compute_params( def reset_params_cache(): - """At the beginning of the prefilling stage, we need clear the + """At the beginning of the prefill stage, we need clear the cache explicitly """ _PARAMS_CACHE.clear() torch.cuda.empty_cache() -def _get_prefilling_params(token_lora_tensor: torch.Tensor, - cache_clear: bool = False): +def _get_prefill_params(token_lora_tensor: torch.Tensor, + cache_clear: bool = False): if cache_clear: reset_params_cache() return _compute_params(token_lora_tensor) +def shrink_prefill( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, + layer_idx: int, + scale: float, + cache_clear: bool = False, +): + ( + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + ) = _get_prefill_params(lora_indices_tensor, cache_clear) + sgmv_shrink( + x, + w_t_all, + y, + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + scale, + ) + + +def shrink_decode( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, + layer_idx: int, + scale: float, +): + bgmv_shrink(x, w_t_all, y, lora_indices_tensor, scale) + + +def expand_prefill( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, + layer_idx: int, + add_input: bool, + cache_clear: bool = False, +): + ( + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + ) = _get_prefill_params(lora_indices_tensor, cache_clear) + sgmv_expand(x, w_t_all, y, b_seq_start_tensor, seq_length_tensor, + last_lora_indices_tensor, batch_size, max_length, add_input) + + +def expand_decode( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, + layer_idx: int, + add_input: bool, +): + bgmv_expand(x, w_t_all, y, lora_indices_tensor, add_input) + + +def expand_slice_prefill( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, + layer_idx: int, + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool, + cache_clear: bool = False, +): + ( + b_seq_start_tensor, + seq_length_tensor, + last_lora_indices_tensor, + batch_size, + max_length, + ) = _get_prefill_params(lora_indices_tensor, cache_clear) + sgmv_expand_slice(x, w_t_all, y, b_seq_start_tensor, seq_length_tensor, + last_lora_indices_tensor, batch_size, max_length, + y_offset, y_slice_size, add_input) + + +def expand_slice_decode(y: torch.Tensor, x: torch.Tensor, + w_t_all: torch.Tensor, + lora_indices_tensor: torch.Tensor, layer_idx: int, + y_offset: Optional[int], y_slice_size: Optional[int], + add_input: bool): + bgmv_expand_slice(x, w_t_all, y, lora_indices_tensor, y_offset, + y_slice_size, add_input) + + def add_shrink( y: torch.Tensor, x: torch.Tensor, @@ -66,34 +169,22 @@ def add_shrink( lora_indices_tensor: torch.Tensor, layer_idx: int, scale: float, - is_prefilling: bool, + is_prefill: bool, cache_clear: bool = False, ): """ - y=x@w_t_all - When `is_prefilling` is True, will launch `sgmv_shrink` + Perform the ` y+=x@w_t_all` computation, which is suitable for the + GEMM of lora'a. + When `is_prefill is` true, it indicates that it is currently the + prefill stage, and the `shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the shrink_decode function + should be called. 
""" - if is_prefilling: - ( - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - ) = _get_prefilling_params(lora_indices_tensor, cache_clear) - sgmv_shrink( - x, - w_t_all, - y, - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - scale, - ) + if is_prefill: + shrink_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, scale, + cache_clear) else: - bgmv_shrink(x, w_t_all, y, lora_indices_tensor, scale) + shrink_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, scale) def add_expand( @@ -102,35 +193,23 @@ def add_expand( w_t_all: torch.Tensor, lora_indices_tensor: torch.Tensor, layer_idx: int, - is_prefilling: bool, + is_prefill: bool, add_input: bool = True, cache_clear: bool = False, ): """ - y+=x@w_t_all - When `is_prefilling` is True, will launch `sgmv_expand`, + Perform the ` y+=x@w_t_all` computation, which is suitable for the + GEMM of lora'b. + When `is_prefill` is true, it indicates that it is currently the + prefill stage, and the `expand_prefill` function should be called. + Otherwise, it is the decode stage, and the expand_decode function + should be called. """ - if is_prefilling: - ( - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - ) = _get_prefilling_params(lora_indices_tensor, cache_clear) - sgmv_expand( - x, - w_t_all, - y, - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - add_input, - ) + if is_prefill: + expand_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, + add_input, cache_clear) else: - bgmv_expand(x, w_t_all, y, lora_indices_tensor, add_inputs=add_input) + expand_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, add_input) def add_expand_slice( @@ -139,46 +218,21 @@ def add_expand_slice( w_t_all: torch.Tensor, lora_indices_tensor: torch.Tensor, layer_idx: int, - is_prefilling: bool, + is_prefill: bool, y_offset: Optional[int], y_slice_size: Optional[int], add_input: bool = True, cache_clear: bool = False, ): """ - y+=x@w_t_all + Similar to `add_expand` """ - if is_prefilling: - ( - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - ) = _get_prefilling_params(lora_indices_tensor, cache_clear) - sgmv_expand_slice( - x, - w_t_all, - y, - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - y_offset, - y_slice_size, - add_input, - ) + if is_prefill: + expand_slice_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, + y_offset, y_slice_size, add_input, cache_clear) else: - bgmv_expand_slice( - x, - w_t_all, - y, - lora_indices_tensor, - y_offset, - y_slice_size, - add_inputs=add_input, - ) + expand_slice_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, + y_offset, y_slice_size, add_input) def add_lora( @@ -189,7 +243,7 @@ def add_lora( lora_indices_tensor: torch.Tensor, layer_idx: int, scale: float, - is_prefilling: bool, + is_prefill: bool, y_offset: Optional[int] = None, y_slice_size: Optional[int] = None, *, @@ -212,7 +266,7 @@ def add_lora( lora_indices_tensor (torch.Tensor): _description_ layer_idx (int): Layer index of LoRA weights. scale (float): Scaling factor. - is_prefilling (bool): prefiling stage + is_prefill (bool): prefiling stage y_offset (Optional[int], optional): Offset to apply to the starting column of y. y_slice_size (Optional[int], optional): Size of the y column slice.. 
@@ -235,30 +289,26 @@ def add_lora( lora_indices_tensor, 0, scale, - is_prefilling, + is_prefill, cache_clear=cache_clear, ) if y_offset is None and y_slice_size is None: - add_expand( - y, - buffer, - wb_t_all, - lora_indices_tensor, - 0, - is_prefilling, - add_input=True, - cache_clear=cache_clear, - ) + add_expand(y, + buffer, + wb_t_all, + lora_indices_tensor, + 0, + is_prefill, + add_input=True, + cache_clear=cache_clear) else: - add_expand_slice( - y, - buffer, - wb_t_all, - lora_indices_tensor, - 0, - is_prefilling, - y_offset, - y_slice_size, - add_input=True, - cache_clear=cache_clear, - ) + add_expand_slice(y, + buffer, + wb_t_all, + lora_indices_tensor, + 0, + is_prefill, + y_offset, + y_slice_size, + add_input=True, + cache_clear=cache_clear) From 59d17f457ca7cb46cd16fdeb512597096f2ee385 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 9 Jul 2024 00:22:52 +0800 Subject: [PATCH 46/71] refactor sgmv metadata --- tests/lora/test_lora.py | 14 +-- tests/lora/test_triton_punica.py | 4 +- vllm/lora/layers.py | 10 +- vllm/lora/models.py | 14 ++- vllm/lora/punica.py | 166 +++++++++++++++++++++---------- 5 files changed, 137 insertions(+), 71 deletions(-) diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py index 51708c8fa6e5..a4ca7a93e62e 100644 --- a/tests/lora/test_lora.py +++ b/tests/lora/test_lora.py @@ -20,7 +20,7 @@ torch.bfloat16: (3e-2, 2e-2), } -STAGES = [0, 1] #prefilling(1) or decoding(0) +STAGES = [0, 1] #prefill stage(1) or decode stage(0) @pytest.mark.parametrize("m", TENSOR_SIZES) @@ -68,7 +68,7 @@ def test_apply_lora(m, n, k, rank, dtype, stage) -> None: device="cuda"), indices_info, output, - cache_clear=True) + need_update=True) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) @@ -80,7 +80,7 @@ def test_apply_lora(m, n, k, rank, dtype, stage) -> None: torch.full((len(input), ), -1, device="cuda"), indices_info, output, - cache_clear=True) + need_update=True) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() @@ -149,7 +149,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype, stage) -> None: device="cuda"), indices_info, output, (m // 2, m // 2), - cache_clear=True) + need_update=True) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) @@ -161,7 +161,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype, stage) -> None: torch.full((len(input), ), -1, device="cuda"), indices_info, output, (m // 2, m // 2), - cache_clear=True) + need_update=True) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() @@ -245,7 +245,7 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype, stage) -> None: device="cuda"), indices_info, output, (qkv[0], qkv[1], qkv[2]), - cache_clear=True) + need_update=True) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) @@ -257,7 +257,7 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype, stage) -> None: torch.full((len(input), ), -1, device="cuda"), indices_info, output, (qkv[0], qkv[1], qkv[2]), - cache_clear=True) + need_update=True) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 9bbc529188d8..3ed2f032241e 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -62,10 +62,10 @@ 128256, ] -BATCHES = [1, 2, 4] + [8 * i for i in range(1, 4)] +BATCHES = [1, 2, 4] + [8 * i for i in range(1, 7)] 
NUM_LORA = [1, 4, 8, 16, 32, 64, 128] DTYPES = [torch.float16, torch.bfloat16] -MAX_RANKS = [1] +MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] SCALES = [0.5] SEED = [0] CUDA_DEVICES = [f"cuda:{0}"] diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 632eb75f9699..57b0eb2347d8 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -69,7 +69,7 @@ def _apply_lora(x: torch.Tensor, lora_index_tensor: torch.Tensor, indices_info: List[int], output: torch.Tensor, - cache_clear: bool = False) -> torch.Tensor: + need_update: bool = False) -> torch.Tensor: """Applies lora to each input. This method applies all loras to each input. It uses the `lora_index_tensor` vector to determine which lora yields the correct output. An index of -1 means no lora should be @@ -85,6 +85,8 @@ def _apply_lora(x: torch.Tensor, sampler_indices_padded,embeddings_indices, long_lora_indices, prefill flag). output (torch.Tensor): (batch_size, output_dim) + need_update (bool, optional): Indicates whether updating sgmv metadata + is needed. Defaults to False. Returns: output (torch.Tensor): (batch_size*seq_number, output_dim) @@ -103,7 +105,7 @@ def _apply_lora(x: torch.Tensor, 0, 1.0, is_prefill, - cache_clear=cache_clear) + need_update=need_update) return output.view_as(org_output) @@ -116,7 +118,7 @@ def _apply_lora_packed_nslice(x: torch.Tensor, indices_info: List[int], output: torch.Tensor, output_slices: Tuple[int, ...], - cache_clear: bool = False) -> torch.Tensor: + need_update: bool = False) -> torch.Tensor: """ Applies lora to each input. Similar to _apply_lora, This method is used for layers that are composed of multiple sublayers @@ -141,7 +143,7 @@ def _apply_lora_packed_nslice(x: torch.Tensor, is_prefill, offset_left, output_slices[slice_idx], - cache_clear=cache_clear) + need_update=need_update) offset_left += output_slices[slice_idx] return output.view_as(org_output) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 5324d50380dc..e18fdaa00e9a 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -12,11 +12,11 @@ from vllm.config import LoRAConfig from vllm.logger import init_logger -from vllm.lora import punica from vllm.lora.layers import (BaseLayerWithLoRA, LinearScalingRotaryEmbeddingWithLora, LoRAMapping) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.lora.punica import PrefillHelper from vllm.lora.utils import (from_layer, from_layer_logits_processor, parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA @@ -26,6 +26,9 @@ _GLOBAL_LORA_ID = 0 +# NOTE This value comes fromllm/worker/model_runner.py +_MAX_BATCH_SIZE = 256 + @dataclass class LongContextLoRAContext: @@ -460,6 +463,9 @@ def __init__( # base_indices, sampler_indices, sampler_indices_padded, # embeddings_indices,long_lora_indices,prefill or decode stage self.indices_len: List[Optional[int]] = [None] * 6 + self.prefill_helper = PrefillHelper(max_batches=_MAX_BATCH_SIZE, + device=str( + self.base_indices.device)) self.model = model if hasattr(self.model, "supported_lora_modules"): @@ -621,10 +627,9 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: self.long_lora_indices.zero_() # Maintain the reference self.indices_len[:] = indices_len - # if mapping.is_prefill: - punica.reset_params_cache() - punica._compute_params(self.base_indices[:base_indices.shape[0]]) + self.prefill_helper.get_metadata( + self.base_indices[:base_indices.shape[0]], need_update=True) def set_lora_mapping(self, lora_mapping: LoRAMapping) -> None: if 
self._last_mapping != lora_mapping: @@ -643,7 +648,6 @@ def remove_all_loras(self): self._registered_loras.clear() self.lora_index_to_id = [None] * self.lora_slots self._active_loras.clear() - punica.reset_params_cache() def _create_lora_modules(self): for module_name, module in self.model.named_modules( diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 9474744ade2e..5347a68852a0 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -1,6 +1,11 @@ -# Based on code from https://github.com/punica-ai/punica +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. +https://arxiv.org/abs/2310.18547 +""" -from typing import Dict, Optional, Tuple +from typing import Optional, Tuple import torch @@ -11,12 +16,10 @@ from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink -_PARAMS_CACHE: Dict[int, Tuple] = {} - -def _compute_params( +def _compute_meta( token_lora_tensor: torch.Tensor -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: """ Get the information required for the sgmv kernel. With the features: 1. If consecutive requests in the batch use the same LoRA, this function @@ -25,38 +28,94 @@ def _compute_params( 2. At the beginning of each prefill stage inference, recalculations are needed based on the input, but only once. """ - pointer = token_lora_tensor.data_ptr() - if pointer not in _PARAMS_CACHE: - lora_indices_tensor, seq_length_tensor = torch.unique_consecutive( - token_lora_tensor, return_counts=True) - cum_result = torch.cumsum(seq_length_tensor, dim=0) - b_seq_start_tensor = torch.zeros_like(seq_length_tensor) - b_seq_start_tensor[1:].copy_(cum_result[:-1]) - max_length = seq_length_tensor.max().item() - batch_size = lora_indices_tensor.size(0) - _PARAMS_CACHE[pointer] = ( - b_seq_start_tensor, - seq_length_tensor, - lora_indices_tensor, - batch_size, - max_length, - ) - return _PARAMS_CACHE[pointer] - - -def reset_params_cache(): - """At the beginning of the prefill stage, we need clear the - cache explicitly - """ - _PARAMS_CACHE.clear() - torch.cuda.empty_cache() + + lora_indices_tensor, seq_length_tensor = torch.unique_consecutive( + token_lora_tensor, return_counts=True) + cum_result = torch.cumsum(seq_length_tensor, dim=0) + b_seq_start_tensor = torch.zeros_like(seq_length_tensor) + b_seq_start_tensor[1:].copy_(cum_result[:-1]) + max_length = seq_length_tensor.max().item() + batch_size = lora_indices_tensor.size(0) + return ( + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + ) -def _get_prefill_params(token_lora_tensor: torch.Tensor, - cache_clear: bool = False): - if cache_clear: - reset_params_cache() - return _compute_params(token_lora_tensor) +class PrefillHelper: + """PrefillHelper is designed to manage and provide metadata for the sgmv + kernel during prefill stage, utilizing the singleton pattern to guarantee + the existence of only one instance of the class. + """ + _instance: Optional["PrefillHelper"] = None + initialized: bool + + def __new__(cls, *args, **kwargs): + if cls._instance is None: + cls._instance = super().__new__(cls) + cls._instance.initialized = False + return cls._instance + + def __init__(self, max_batches: int = 256, device: str = "cuda"): + """ + Args: + max_batches (int, optional): the maximum batch to pre-allocate. + Defaults to 256. 
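    The `__new__`-based singleton used by PrefillHelper can be exercised on its
    own; a stripped-down sketch of the same pattern (class name is illustrative):

        class _SharedState:
            _instance = None

            def __new__(cls, *args, **kwargs):
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance.initialized = False
                return cls._instance

        a = _SharedState()
        b = _SharedState()
        assert a is b    # repeated construction returns one shared metadata holder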
+ device (str, optional): Defaults to "cuda". + """ + if not self.initialized: + self.initialized = True + # these attributes are the information required for sgmv kernel + self.b_seq_start_tensor = torch.zeros(max_batches, + dtype=torch.long, + device=device) + self.seq_length_tensor = torch.empty(max_batches, + dtype=torch.long, + device=device) + self.lora_indices_tensor = torch.empty(max_batches, + dtype=torch.long, + device=device) + self.max_length: int = 0 + self.batch_size: int = -1 + + def _update_metada(self, token_lora_tensor: torch.Tensor) -> None: + + (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, + batch_size, max_length) = _compute_meta(token_lora_tensor) + + self.b_seq_start_tensor[:b_seq_start_tensor.shape[0]].copy_( + b_seq_start_tensor) + self.seq_length_tensor[:seq_length_tensor.shape[0]].copy_( + seq_length_tensor) + self.lora_indices_tensor[:lora_indices_tensor.shape[0]].copy_( + lora_indices_tensor) + self.batch_size = batch_size + self.max_length = max_length + + def get_metadata( + self, + token_lora_tensor: torch.Tensor, + need_update: bool = False + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: + + #Need to recalculate and fill metadata. + if need_update: + self._update_metada(token_lora_tensor) + + return (self.b_seq_start_tensor[:self.batch_size], + self.seq_length_tensor[:self.batch_size], + self.lora_indices_tensor[:self.batch_size], self.batch_size, + self.max_length) + + +def get_prefill_meta(token_lora_tensor: torch.Tensor, + need_update: bool = False): + prefill_helper = PrefillHelper(max_batches=256, + device=str(token_lora_tensor.device)) + return prefill_helper.get_metadata(token_lora_tensor, need_update) def shrink_prefill( @@ -66,7 +125,7 @@ def shrink_prefill( lora_indices_tensor: torch.Tensor, layer_idx: int, scale: float, - cache_clear: bool = False, + need_update: bool = False, ): ( b_seq_start_tensor, @@ -74,7 +133,7 @@ def shrink_prefill( last_lora_indices_tensor, batch_size, max_length, - ) = _get_prefill_params(lora_indices_tensor, cache_clear) + ) = get_prefill_meta(lora_indices_tensor, need_update) sgmv_shrink( x, w_t_all, @@ -106,7 +165,7 @@ def expand_prefill( lora_indices_tensor: torch.Tensor, layer_idx: int, add_input: bool, - cache_clear: bool = False, + need_update: bool = False, ): ( b_seq_start_tensor, @@ -114,7 +173,7 @@ def expand_prefill( last_lora_indices_tensor, batch_size, max_length, - ) = _get_prefill_params(lora_indices_tensor, cache_clear) + ) = get_prefill_meta(lora_indices_tensor, need_update) sgmv_expand(x, w_t_all, y, b_seq_start_tensor, seq_length_tensor, last_lora_indices_tensor, batch_size, max_length, add_input) @@ -139,7 +198,7 @@ def expand_slice_prefill( y_offset: Optional[int], y_slice_size: Optional[int], add_input: bool, - cache_clear: bool = False, + need_update: bool = False, ): ( b_seq_start_tensor, @@ -147,7 +206,7 @@ def expand_slice_prefill( last_lora_indices_tensor, batch_size, max_length, - ) = _get_prefill_params(lora_indices_tensor, cache_clear) + ) = get_prefill_meta(lora_indices_tensor, need_update) sgmv_expand_slice(x, w_t_all, y, b_seq_start_tensor, seq_length_tensor, last_lora_indices_tensor, batch_size, max_length, y_offset, y_slice_size, add_input) @@ -170,7 +229,7 @@ def add_shrink( layer_idx: int, scale: float, is_prefill: bool, - cache_clear: bool = False, + need_update: bool = False, ): """ Perform the ` y+=x@w_t_all` computation, which is suitable for the @@ -182,7 +241,7 @@ def add_shrink( """ if is_prefill: shrink_prefill(y, x, w_t_all, 
lora_indices_tensor, layer_idx, scale, - cache_clear) + need_update) else: shrink_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, scale) @@ -195,7 +254,7 @@ def add_expand( layer_idx: int, is_prefill: bool, add_input: bool = True, - cache_clear: bool = False, + need_update: bool = False, ): """ Perform the ` y+=x@w_t_all` computation, which is suitable for the @@ -207,7 +266,7 @@ def add_expand( """ if is_prefill: expand_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, - add_input, cache_clear) + add_input, need_update) else: expand_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, add_input) @@ -222,14 +281,14 @@ def add_expand_slice( y_offset: Optional[int], y_slice_size: Optional[int], add_input: bool = True, - cache_clear: bool = False, + need_update: bool = False, ): """ Similar to `add_expand` """ if is_prefill: expand_slice_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, - y_offset, y_slice_size, add_input, cache_clear) + y_offset, y_slice_size, add_input, need_update) else: expand_slice_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, y_offset, y_slice_size, add_input) @@ -248,7 +307,7 @@ def add_lora( y_slice_size: Optional[int] = None, *, buffer: Optional[torch.Tensor] = None, - cache_clear: bool = False, + need_update: bool = False, ): """ Semantics: @@ -271,7 +330,8 @@ def add_lora( column of y. y_slice_size (Optional[int], optional): Size of the y column slice.. buffer (Optional[torch.Tensor], optional): Defaults to None. - cache_clear (bool, optional): Defaults to False. + need_update (bool, optional): Indicates whether updating sgmv metadata + is needed. Defaults to False. """ r = wb_t_all.size(-1) @@ -290,7 +350,7 @@ def add_lora( 0, scale, is_prefill, - cache_clear=cache_clear, + need_update=need_update, ) if y_offset is None and y_slice_size is None: add_expand(y, @@ -300,7 +360,7 @@ def add_lora( 0, is_prefill, add_input=True, - cache_clear=cache_clear) + need_update=need_update) else: add_expand_slice(y, buffer, @@ -311,4 +371,4 @@ def add_lora( y_offset, y_slice_size, add_input=True, - cache_clear=cache_clear) + need_update=need_update) From 46486972eb2fac8da50eaa4c26d01dfdf7924c35 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 9 Jul 2024 16:02:49 +0800 Subject: [PATCH 47/71] fix typo --- vllm/lora/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index e18fdaa00e9a..cf26fe66ff41 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -26,7 +26,7 @@ _GLOBAL_LORA_ID = 0 -# NOTE This value comes fromllm/worker/model_runner.py +# NOTE This value comes from vllm/worker/model_runner.py _MAX_BATCH_SIZE = 256 From 8732c76e30c6485e04b08edc6768b6d1dffe7eab Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 10 Jul 2024 16:46:56 +0800 Subject: [PATCH 48/71] refactor punica wrapper --- vllm/lora/fully_sharded_layers.py | 95 +--- vllm/lora/layers.py | 283 ++-------- vllm/lora/models.py | 192 +------ vllm/lora/ops/sgmv_expand.py | 4 +- vllm/lora/ops/sgmv_expand_slice.py | 4 +- vllm/lora/ops/sgmv_shrink.py | 4 +- vllm/lora/punica.py | 827 ++++++++++++++++++----------- 7 files changed, 589 insertions(+), 820 deletions(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index d21649bed5d6..cae7d593f123 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -14,7 +14,8 @@ MergedQKVParallelLinearWithLora, QKVParallelLinearWithLora, RowParallelLinearWithLoRA) -from vllm.lora.punica import add_expand, 
add_expand_slice, add_shrink + +# from vllm.lora.punica import add_expand, add_expand_slice, add_shrink if TYPE_CHECKING: pass @@ -64,27 +65,12 @@ def apply(self, x: torch.Tensor, dtype=torch.float32, device=x.device, ) - token_num = self.indices_len[0] - is_prefill = bool(self.indices_len[5]) - add_shrink( - buffer, - x, - self.lora_a_stacked, - self.indices[:token_num], - 0, - 1.0, - is_prefill, - ) + self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) buffer = tensor_model_parallel_all_gather(buffer) - add_expand( - output, - buffer, - self.lora_b_stacked, - self.indices[:token_num], - 0, - is_prefill, - add_input=True, - ) + self.punica_wrapper.add_expand(output, + buffer, + self.lora_b_stacked, + add_input=True) # now have column partitioned output output = output.view(*out_orig_shape) return output @@ -108,7 +94,7 @@ def can_replace_layer( ) -def _mcp_apply(x, bias, layer): +def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): """ MergedColumnParallelLinearWithShardedLoRA and QKVParallelLinearWithShardedLora share the same @@ -129,31 +115,18 @@ def _mcp_apply(x, bias, layer): dtype=torch.float32, device=x.device, ) - token_num = layer.indices_len[0] - is_prefill = bool(layer.indices_len[5]) for idx in range(n): - - add_shrink( - buffers[idx], - x, - layer.lora_a_stacked[idx], - layer.indices[:token_num], - 0, - 1.0, - is_prefill, - ) + layer.punica_wrapper.add_shrink(buffers[idx], x, + layer.lora_a_stacked[idx], 1.0) buffers = tensor_model_parallel_all_gather(buffers) left_offset = 0 for idx in range(n): shard_size = layer.lora_b_stacked[idx].shape[2] - add_expand_slice( + layer.punica_wrapper.add_expand_slice( output, buffers[idx], layer.lora_b_stacked[idx], - layer.indices[:token_num], - 0, - is_prefill, left_offset, shard_size, add_input=True, @@ -237,23 +210,13 @@ def apply(self, x: torch.Tensor, buffer = torch.zeros((x.shape[0], self.lora_a_stacked.shape[2]), dtype=torch.float32, device=x.device) - - token_num = self.indices_len[0] - is_prefill = bool(self.indices_len[5]) - - add_shrink(buffer, x, self.lora_a_stacked, self.indices[:token_num], 0, - 1.0, is_prefill) + self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) buffer = tensor_model_parallel_all_gather(buffer) - - add_expand(output, - buffer, - self.lora_b_stacked, - self.indices[:token_num], - 0, - is_prefill, - add_input=True) + self.punica_wrapper.add_expand(output, + buffer, + self.lora_b_stacked, + add_input=True) # now have column partitioned output - output = output.view(*out_orig_shape) return output @@ -345,17 +308,8 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: dtype=torch.float32, device=x.device, ) - token_num = self.indices_len[0] - is_prefill = bool(self.indices_len[5]) - add_shrink( - buffer, - x, - self.lora_a_stacked, - self.indices[:token_num], - 0, - 1.0, - is_prefill, - ) + + self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) buffer = tensor_model_parallel_all_reduce(buffer) # following S-LoRA, allows the fusing of all_gather and all_reduce @@ -366,16 +320,9 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # reduced before being used shard_size = self.lora_b_stacked.shape[2] start_idx = self.tp_rank * shard_size - add_expand_slice( - output, - buffer, - self.lora_b_stacked, - self.indices[:self.indices_len[0]], - 0, - is_prefill, - start_idx, - shard_size, - ) + self.punica_wrapper.add_expand_slice(output, buffer, + self.lora_b_stacked, start_idx, + shard_size) output = output.view(*out_orig_shape) return output diff --git 
a/vllm/lora/layers.py b/vllm/lora/layers.py index 57b0eb2347d8..16d086f2e8a1 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -16,8 +16,7 @@ tensor_model_parallel_all_reduce, tensor_model_parallel_gather) from vllm.distributed.utils import divide -# from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.punica import add_expand, add_lora +from vllm.lora.punica import PunicaWrapper from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, @@ -63,92 +62,6 @@ def dec(*args, **kwargs): return dec -def _apply_lora(x: torch.Tensor, - lora_a_stacked: torch.Tensor, - lora_b_stacked: torch.Tensor, - lora_index_tensor: torch.Tensor, - indices_info: List[int], - output: torch.Tensor, - need_update: bool = False) -> torch.Tensor: - """Applies lora to each input. This method applies all loras to each - input. It uses the `lora_index_tensor` vector to determine which lora - yields the correct output. An index of -1 means no lora should be - applied. This method adds the final lora results to the output. - - Args: - x (torch.Tensor): (batch_size, hidden_dim) - lora_a_stacked (torch.Tensor): (num_loras, lora_rank, hidden_dim) - lora_b_stacked (torch.Tensor): (num_loras, output_dim, lora_rank) - lora_index_tensor (torch.Tensor): (batch_size*seq_number,). The LoRA - index corresponding to each token - indices_len(List):(6,), It contains (base_indices, sampler_indices, - sampler_indices_padded,embeddings_indices, long_lora_indices, - prefill flag). - output (torch.Tensor): (batch_size, output_dim) - need_update (bool, optional): Indicates whether updating sgmv metadata - is needed. Defaults to False. - - Returns: - output (torch.Tensor): (batch_size*seq_number, output_dim) - - """ - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - token_num = indices_info[0] - is_prefill = bool(indices_info[5]) - add_lora(output, - x, - lora_a_stacked, - lora_b_stacked, - lora_index_tensor[:token_num], - 0, - 1.0, - is_prefill, - need_update=need_update) - return output.view_as(org_output) - - -def _apply_lora_packed_nslice(x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, - torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, - torch.Tensor], - lora_index_tensor: torch.Tensor, - indices_info: List[int], - output: torch.Tensor, - output_slices: Tuple[int, ...], - need_update: bool = False) -> torch.Tensor: - """ - Applies lora to each input. Similar to _apply_lora, This method is - used for layers that are composed of multiple sublayers - (slices) packed together. 
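    The packed-slice bookkeeping that moves into `add_lora_packed_nslice` can be
    pictured with this minimal sketch (LoRA math elided, only the column offsets
    are shown; sizes are made up):

        import torch

        output = torch.zeros(4, 12)                  # fused output: slice widths 5 + 7
        slice_outputs = [torch.ones(4, 5), 2 * torch.ones(4, 7)]
        output_slices = (5, 7)

        offset_left = 0
        for s, width in zip(slice_outputs, output_slices):
            # each slice has its own lora_a/lora_b pair; its result lands at
            # its own column offset inside the fused output tensor
            output[:, offset_left:offset_left + width] += s
            offset_left += width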
- """ - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - - token_num = indices_info[0] - is_prefill = bool(indices_info[5]) - offset_left = 0 - # TODO fuse these kernels - for slice_idx in range(len(output_slices)): - add_lora(output, - x, - lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], - lora_index_tensor[:token_num], - 0, - 1.0, - is_prefill, - offset_left, - output_slices[slice_idx], - need_update=need_update) - offset_left += output_slices[slice_idx] - - return output.view_as(org_output) - - @dataclass class LoRAMapping: # Per every token in input_ids: @@ -202,15 +115,9 @@ def set_lora( def set_mapping( self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], + punica_wrapper: PunicaWrapper, ): - """Sets the mapping indices.""" - ... + self.punica_wrapper: PunicaWrapper = punica_wrapper @classmethod def can_replace_layer( @@ -288,10 +195,6 @@ def create_lora_weights( self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1], self.lora_a_stacked.shape[2], ) - # Lazily initialized. - self.indices: torch.Tensor - self.indices_len: List[int] - self.embeddings_indices: torch.Tensor def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -326,28 +229,15 @@ def set_lora( assert self.embeddings_weights is not None self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings) - def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): - self.indices = base_indices - self.embeddings_indices = embeddings_indices - self.indices_len = indices_len - def forward(self, x: torch.Tensor) -> torch.Tensor: added_tokens_mask = x > self.base_layer.org_vocab_size - 1 - embedding_len = self.indices_len[3] - indices = self.embeddings_indices[1][:embedding_len].view_as(x) + embeddings_indices = self.punica_wrapper.embeddings_indices + indices = embeddings_indices[1].view_as(x) full_lora_a_embeddings = F.embedding( x + indices, self.lora_a_stacked_2d, ) - indices = self.embeddings_indices[0][:embedding_len].view_as(x) + indices = embeddings_indices[0].view_as(x) full_output = self.base_layer.forward( x.add_(indices * added_tokens_mask)) @@ -361,20 +251,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[1], -1, ) - # full_lora_a_embeddings = full_lora_a_embeddings.view( - # -1, full_lora_a_embeddings.shape[-1]) - # full_output = full_output.view(-1, full_output.shape[-1]) - token_num = self.indices_len[0] - is_prefill = bool(self.indices_len[5]) - add_expand( - full_output, - full_lora_a_embeddings, - self.lora_b_stacked, - self.indices[:token_num], - 0, - is_prefill, - add_input=True, - ) + + # Embedding layer only need expand op + self.punica_wrapper.add_expand(full_output, + full_lora_a_embeddings, + self.lora_b_stacked, + add_input=True) return full_output.view_as(full_output_org) @classmethod @@ -432,10 +314,6 @@ def create_lora_weights( ) self.output_dim = self.lora_b_stacked.shape[2] - # lazily initialized. 
- self.indices: torch.Tensor - self.indices_len: List[int] - def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 @@ -471,29 +349,11 @@ def set_lora( 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) - def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): - self.indices = base_indices - self.indices_len = indices_len - def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora( - x, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices, - self.indices_len, - output, - ) + self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, + self.lora_b_stacked, 1.0) return output def forward(self, input_): @@ -587,9 +447,6 @@ def create_lora_weights( ) for _ in range(n_slices)) self.output_dim = self.lora_b_stacked[0].shape[2] - # Lazily initialized. - self.indices: torch.Tensor - self.indices_len: torch.Tensor def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 @@ -647,15 +504,9 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_packed_nslice( - x, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices, - self.indices_len, - output, - (self.output_dim, self.output_dim), - ) + self.punica_wrapper.add_lora_packed_nslice( + output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, + (self.output_dim, self.output_dim)) return output @classmethod @@ -917,16 +768,10 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - _apply_lora_packed_nslice( - x, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices, - self.indices_len, - output, - self.output_slices, - ) - + self.punica_wrapper.add_lora_packed_nslice(output, x, + self.lora_a_stacked, + self.lora_b_stacked, 1.0, + self.output_slices) return output @classmethod @@ -984,9 +829,6 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) - # Lazily initialized - self.indices: torch.Tensor - self.indices_len: List[int] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -1023,29 +865,10 @@ def set_lora( 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) - def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): - self.indices = base_indices - self.indices_len = indices_len - def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) - # maybe we need not restrict range to [:batch_size] - _apply_lora( - x, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices, - self.indices_len, - output, - ) + self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, + self.lora_b_stacked, 1.0) return output def forward(self, input_): @@ -1200,10 +1023,6 @@ def create_lora_weights( dtype=torch.long) else: self.sharded_to_full_mapping_gpu = None - # Lazily initialized. 
- self.indices: torch.Tensor - self.indices_len: List[int] - self.indices_padded: torch.Tensor def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 @@ -1229,19 +1048,6 @@ def set_lora( index, :embeddings_tensor.shape[0], :embeddings_tensor. shape[1], ] = embeddings_tensor - def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): - self.indices = sampler_indices - self.indices_padded = sampler_indices_padded - self.indices_len = indices_len - def _get_logits( self, hidden_states: torch.Tensor, @@ -1287,34 +1093,24 @@ def _get_logits( out=lora_logits[:-1]) lora_logits[-1] = float("-inf") lora_logits = lora_logits.mT + indices_padded = self.punica_wrapper.sampler_indices_padded lora_logits = (lora_logits.reshape( lora_logits.shape[0] * lora_logits.shape[1], lora_logits.shape[2], - ).index_select(0, - self.indices_padded[:self.indices_len[2]]).nan_to_num_( - nan=float("-inf"), - posinf=float("inf"), - neginf=float("-inf"))) + ).index_select(0, indices_padded).nan_to_num_(nan=float("-inf"), + posinf=float("inf"), + neginf=float("-inf"))) logits[:, self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + lora_logits.shape[1], ] = lora_logits - sampler_indices = self.indices_len[1] # LogitsProcessorWithLoRA always using bgmv - is_prefill = False - add_lora( - logits, - hidden_states, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices[:sampler_indices], - 0, - 1.0, - is_prefill, - ) + self.punica_wrapper.add_lora_logits(logits, hidden_states, + self.lora_a_stacked, + self.lora_b_stacked, 1.0) + # Remove paddings in vocab (if any). logits = logits[:, :self.base_layer.vocab_size] - return logits def forward(self, *args, **kwargs): @@ -1343,9 +1139,6 @@ class LinearScalingRotaryEmbeddingWithLora(BaseLayerWithLoRA): def __init__(self, base_layer: RotaryEmbedding) -> None: super().__init__() self.base_layer = base_layer - # Lazily initialized - self.long_lora_indices: torch.Tensor - self.indices_len: List[int] @property def scaling_factors(self): @@ -1389,18 +1182,6 @@ def set_lora( ): ... 
- def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - long_lora_indices: torch.Tensor, - indices_len: List[int], - ): - self.long_lora_indices = long_lora_indices - self.indices_len = indices_len - def forward( self, positions: torch.Tensor, @@ -1411,7 +1192,7 @@ def forward( positions, query, key, - offsets=self.long_lora_indices[:self.indices_len[4]], + offsets=self.punica_wrapper.long_lora_indices, ) @property diff --git a/vllm/lora/models.py b/vllm/lora/models.py index cf26fe66ff41..d743f1c52bf7 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,7 +4,7 @@ import os import re from dataclasses import dataclass, field -from typing import Callable, Dict, List, Optional, Tuple, Type, Union +from typing import Callable, Dict, List, Optional, Type import safetensors.torch import torch @@ -16,7 +16,7 @@ LinearScalingRotaryEmbeddingWithLora, LoRAMapping) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights -from vllm.lora.punica import PrefillHelper +from vllm.lora.punica import PunicaWrapper from vllm.lora.utils import (from_layer, from_layer_logits_processor, parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA @@ -43,128 +43,6 @@ class LongContextLoRAContext: offsets_by_lora_id: Dict[int, int] = field(default_factory=dict) -def convert_mapping( - mapping: LoRAMapping, - lora_index_to_id: List[Optional[int]], - max_loras: int, - vocab_size: int, - extra_vocab_size: int, - long_lora_context: Optional[LongContextLoRAContext] = None, -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor], List[int], ]: - """Converts LoRAMapping to index tensors. - - Args: - mapping: LoRAMapping mapping rows in a batch to LoRA ids. - lora_index_to_id: List mapping LoRA ids to LoRA indices. - max_loras: Maximum number of LoRAs. - vocab_size: Model vocab size. - extra_vocab_size: Extra vocab size each LoRA can have. - long_lora_context: Passed if there are long context lora in a batch. - - Returns: - A tuple of tensors: - base_indices: Tensor of shape [batch_size] mapping batch rows to - LoRA indices. - sampler_indices: Tensor of shape [batch_size] mapping requests to - LoRA indices for sampler. For generation, this will be the - same as base_indicies. For prefill, this will map requests - to LoRA indices. - sampler_indices_padded: Tensor of shape [batch_size] mapping - requests to LoRA indices for sampler with padding. - Same as sampler_indicies, but -1 is replaced with - max_loras. - embeddings_indices: Tensor of shape [2, batch_size] mapping - requests to embedding indices. First row is for embeddings - added by the LoRAs, second row is for the LoRA.lora_a - embeddings. - long_lora_indices: Tensor of shape [batch_size] mapping - requests to RoPE offsets and rot dims for long LoRAs. - None if long context lora doesn't exist. - indices_len: List of lengths of the above tensors and prefill - flag. Used to index into each tensor. It contains - (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices, prefill flag). 
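    A small worked example of the id-to-slot translation this function performed
    (values are illustrative): per-token LoRA ids from the mapping become slot
    indices, with id 0 mapping to -1 ("no LoRA"):

        lora_index_to_id = [3, 7, None, None]        # slot -> active LoRA id
        index_mapping = [7, 7, 3, 0]                 # per-token LoRA ids, 0 = none
        base_indices = [
            lora_index_to_id.index(i) if i > 0 else -1 for i in index_mapping
        ]
        # base_indices -> [1, 1, 0, -1]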
- """ - index_mapping_indices: List[int] = list(mapping.index_mapping).copy() - embedding_indices = index_mapping_indices.copy() - lora_indices = index_mapping_indices.copy() - long_lora_offsets: Optional[torch.Tensor] = None - if long_lora_context: - long_lora_offsets = torch.zeros(len(index_mapping_indices), - device="cuda", - dtype=torch.long) - prompt_mapping: List[int] = [ - lora_index_to_id.index(x) if x > 0 else -1 - for x in mapping.prompt_mapping - ] - lora_idx = None - for i in range(len(index_mapping_indices)): - # TODO index can be slow. optimize - lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) - if index_mapping_indices[i] > 0 else -1) - embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 - lora_indices[i] = lora_idx - if long_lora_context: - assert long_lora_offsets is not None - lora_offset: int = long_lora_context.offsets_by_lora_id.get( - index_mapping_indices[i], 0) - long_lora_offsets[i] = lora_offset - - indices_list: List[Union[List[int], torch.Tensor]] = [ - index_mapping_indices, - lora_indices, - embedding_indices, - ] - if long_lora_context: - assert long_lora_offsets is not None - indices_list.append(long_lora_offsets) - indices = torch.tensor(indices_list, dtype=torch.long, device="cuda") - prompt_mapping_tensor = torch.tensor(prompt_mapping, - device="cuda", - dtype=torch.long) - embeddings_indices = torch.stack([ - indices[2] * extra_vocab_size, - indices[2] * (vocab_size + extra_vocab_size), - ]) - embeddings_indices[embeddings_indices == -1] = max_loras - 1 - base_indices = indices[1] - sampler_indices = prompt_mapping_tensor - sampler_indices_padded = sampler_indices.clone() - sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 - sampler_indices_padded = torch.arange( - 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + ( - sampler_indices_padded * len(sampler_indices_padded)) - long_lora_indices = None - long_lora_indices_len: Optional[int] = None - if long_lora_context: - long_lora_indices = indices[3] - long_lora_indices_len = long_lora_indices.shape[-1] - # Contain length of indices tensors. Used to index into each tensor. 
- indices_len = [ - base_indices.shape[-1], - sampler_indices.shape[-1], - sampler_indices_padded.shape[-1], - embeddings_indices.shape[-1], - ] - if long_lora_indices_len is not None: - indices_len.append(long_lora_indices_len) - else: - #If long_lora doesn't exist,append None - indices_len.append(None) - # Append a prefill flag to help selecting the appropriate lora - # ops (sgmv or bgmv) - indices_len.append(int(mapping.is_prefill)) - return ( - base_indices, - sampler_indices, - sampler_indices_padded, - embeddings_indices, - long_lora_indices, - indices_len, - ) - - def get_lora_id(): global _GLOBAL_LORA_ID _GLOBAL_LORA_ID += 1 @@ -440,32 +318,11 @@ def __init__( self.lora_index_to_id: List[Optional[int]] = [None] * self.lora_slots self.vocab_size = vocab_size self.long_lora_context: Optional[LongContextLoRAContext] = None - self.base_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") - self.sampler_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") - self.sampler_indices_padded = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") - self.embeddings_indices = torch.empty(2, - self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") - self.long_lora_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") - # Scaling factor -> offset to the sin_cos_cache to it. - # Used for long context lora. self.scaling_factor_to_offset: Dict[float, int] = {} - # 6 is the number of indicies tensors. - # base_indices, sampler_indices, sampler_indices_padded, - # embeddings_indices,long_lora_indices,prefill or decode stage - self.indices_len: List[Optional[int]] = [None] * 6 - self.prefill_helper = PrefillHelper(max_batches=_MAX_BATCH_SIZE, - device=str( - self.base_indices.device)) + # maintain the state information for lora + self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, + max_batches=_MAX_BATCH_SIZE, + device="cuda") self.model = model if hasattr(self.model, "supported_lora_modules"): @@ -596,16 +453,9 @@ def pin_lora(self, lora_id: int) -> bool: "Pinning is not supported in LoRAModelManager." "Use LRUCacheLoRAModelManager for pinning") # type: ignore - # TODO see if this can be vectorized def _set_lora_mapping(self, mapping: LoRAMapping) -> None: - ( - base_indices, - sampler_indices, - sampler_indices_padded, - embeddings_indices, - long_lora_offsets_tensor, - indices_len, - ) = convert_mapping( + # update lora states + self.punica_wrapper.update_metadata( mapping, self.lora_index_to_id, self.lora_slots + 1, @@ -613,23 +463,6 @@ def _set_lora_mapping(self, mapping: LoRAMapping) -> None: self.lora_config.lora_extra_vocab_size, self.long_lora_context, ) - self.base_indices[:base_indices.shape[0]].copy_(base_indices) - self.sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) - self.sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( - sampler_indices_padded) - self.embeddings_indices[:embeddings_indices. 
- shape[0], :embeddings_indices.shape[1]].copy_( - embeddings_indices) - if long_lora_offsets_tensor is not None: - self.long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_( - long_lora_offsets_tensor) - else: - self.long_lora_indices.zero_() - # Maintain the reference - self.indices_len[:] = indices_len - if mapping.is_prefill: - self.prefill_helper.get_metadata( - self.base_indices[:base_indices.shape[0]], need_update=True) def set_lora_mapping(self, lora_mapping: LoRAMapping) -> None: if self._last_mapping != lora_mapping: @@ -691,14 +524,7 @@ def _create_lora_modules(self): ) self.register_module(module_name, new_module) self._register_packed_modules(module_name) - new_module.set_mapping( - self.base_indices, - self.sampler_indices, - self.sampler_indices_padded, - self.embeddings_indices, - self.long_lora_indices, - self.indices_len, - ) + new_module.set_mapping(self.punica_wrapper) def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): assert isinstance(module, BaseLayerWithLoRA) diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index f3a53b70f415..2873882bc263 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -159,10 +159,10 @@ def sgmv_expand( torch.bfloat16, ]: CAST_TYPE = True - grid = [ + grid = ( triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), batches, - ] + ) _sgmv_expand_kernel[grid]( inputs, lora_b_weights, diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 52c71c5095b5..2078a47d7e8e 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -171,10 +171,10 @@ def sgmv_expand_slice( torch.bfloat16, ]: CAST_TYPE = True - grid = [ + grid = ( triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), batches, - ] + ) _sgmv_expand_slice_kernel[grid]( inputs, lora_b_weights, diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index f295f0118f0b..094bc62d9da4 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -153,11 +153,11 @@ def sgmv_shrink( BLOCK_K = 32 SPLIT_K = 8 EVEN_K = K % (BLOCK_K * SPLIT_K) == 0 - grid = [ + grid = ( triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N), SPLIT_K, batches, - ] + ) _sgmv_shrink_kernel[grid]( inputs, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 5347a68852a0..da51105fd907 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -3,9 +3,9 @@ Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). Punica: Multi-Tenant LoRA Serving. https://arxiv.org/abs/2310.18547 -""" - -from typing import Optional, Tuple +# """ +# from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union import torch @@ -16,17 +16,23 @@ from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink +if TYPE_CHECKING: + # avodi circuit import + from vllm.lora.layers import LoRAMapping + from vllm.lora.models import LongContextLoRAContext + +@torch.compile def _compute_meta( token_lora_tensor: torch.Tensor -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int]: """ Get the information required for the sgmv kernel. With the features: - 1. If consecutive requests in the batch use the same LoRA, this function - will combine them into a single request, improving sgmv kernel inference + 1. 
If consecutive requests in the batch use the same LoRA, this function + will combine them into a single request, improving sgmv kernel inference performance. - 2. At the beginning of each prefill stage inference, recalculations are - needed based on the input, but only once. + 2. At the beginning of each prefill stage inference, recalculations are + needed based on the input, but only once. """ lora_indices_tensor, seq_length_tensor = torch.unique_consecutive( @@ -45,43 +51,229 @@ def _compute_meta( ) -class PrefillHelper: - """PrefillHelper is designed to manage and provide metadata for the sgmv - kernel during prefill stage, utilizing the singleton pattern to guarantee - the existence of only one instance of the class. +# TODO see if this can be vectorized +def convert_mapping( + mapping: "LoRAMapping", + lora_index_to_id: List[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + long_lora_context: Optional["LongContextLoRAContext"] = None, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, + Optional[torch.Tensor], List[int]]: + """Converts LoRAMapping to index tensors. + + Args: + mapping: LoRAMapping mapping rows in a batch to LoRA ids. + lora_index_to_id: List mapping LoRA ids to LoRA indices. + max_loras: Maximum number of LoRAs. + vocab_size: Model vocab size. + extra_vocab_size: Extra vocab size each LoRA can have. + long_lora_context: Passed if there are long context lora in a batch. + + Returns: + A tuple of tensors: + base_indices: Tensor of shape [batch_size] mapping batch rows to + LoRA indices. + sampler_indices: Tensor of shape [batch_size] mapping requests to + LoRA indices for sampler. For generation, this will be the + same as base_indicies. For prefill, this will map requests + to LoRA indices. + sampler_indices_padded: Tensor of shape [batch_size] mapping + requests to LoRA indices for sampler with padding. + Same as sampler_indicies, but -1 is replaced with + max_loras. + embeddings_indices: Tensor of shape [2, batch_size] mapping + requests to embedding indices. First row is for embeddings + added by the LoRAs, second row is for the LoRA.lora_a + embeddings. + long_lora_indices: Tensor of shape [batch_size] mapping + requests to RoPE offsets and rot dims for long LoRAs. + None if long context lora doesn't exist. + indices_len: List of lengths of the above tensors. It contains + (base_indices, sampler_indices, sampler_indices_padded, + embeddings_indices, long_lora_indices). """ - _instance: Optional["PrefillHelper"] = None - initialized: bool + index_mapping_indices: List[int] = list(mapping.index_mapping).copy() + embedding_indices = index_mapping_indices.copy() + lora_indices = index_mapping_indices.copy() + long_lora_offsets: Optional[torch.Tensor] = None + if long_lora_context: + long_lora_offsets = torch.zeros(len(index_mapping_indices), + device="cuda", + dtype=torch.long) + prompt_mapping: List[int] = [ + lora_index_to_id.index(x) if x > 0 else -1 + for x in mapping.prompt_mapping + ] + lora_idx = None + for i in range(len(index_mapping_indices)): + # TODO index can be slow. 
optimize + lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) + if index_mapping_indices[i] > 0 else -1) + embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 + lora_indices[i] = lora_idx + if long_lora_context: + assert long_lora_offsets is not None + lora_offset: int = long_lora_context.offsets_by_lora_id.get( + index_mapping_indices[i], 0) + long_lora_offsets[i] = lora_offset + + indices_list: List[Union[List[int], torch.Tensor]] = [ + index_mapping_indices, + lora_indices, + embedding_indices, + ] + if long_lora_context: + assert long_lora_offsets is not None + indices_list.append(long_lora_offsets) + indices = torch.tensor(indices_list, dtype=torch.long, device="cuda") + prompt_mapping_tensor = torch.tensor(prompt_mapping, + device="cuda", + dtype=torch.long) + embeddings_indices = torch.stack([ + indices[2] * extra_vocab_size, + indices[2] * (vocab_size + extra_vocab_size), + ]) + embeddings_indices[embeddings_indices == -1] = max_loras - 1 + base_indices = indices[1] + sampler_indices = prompt_mapping_tensor + sampler_indices_padded = sampler_indices.clone() + sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 + sampler_indices_padded = torch.arange( + 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + ( + sampler_indices_padded * len(sampler_indices_padded)) + long_lora_indices = None + long_lora_indices_len: Optional[int] = None + if long_lora_context: + long_lora_indices = indices[3] + long_lora_indices_len = long_lora_indices.shape[-1] + # Contain length of indices tensors. Used to index into each tensor. + indices_len = [ + base_indices.shape[-1], + sampler_indices.shape[-1], + sampler_indices_padded.shape[-1], + embeddings_indices.shape[-1], + ] + if long_lora_indices_len is not None: + indices_len.append(long_lora_indices_len) + else: + # If long_lora doesn't exist,append None + indices_len.append(None) - def __new__(cls, *args, **kwargs): - if cls._instance is None: - cls._instance = super().__new__(cls) - cls._instance.initialized = False - return cls._instance + return ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + long_lora_indices, + indices_len, + ) - def __init__(self, max_batches: int = 256, device: str = "cuda"): - """ - Args: - max_batches (int, optional): the maximum batch to pre-allocate. - Defaults to 256. - device (str, optional): Defaults to "cuda". - """ - if not self.initialized: - self.initialized = True - # these attributes are the information required for sgmv kernel - self.b_seq_start_tensor = torch.zeros(max_batches, - dtype=torch.long, - device=device) - self.seq_length_tensor = torch.empty(max_batches, - dtype=torch.long, - device=device) - self.lora_indices_tensor = torch.empty(max_batches, + +class PunicaWrapper: + """PunicaWrapper is designed to manage and provide metadata for the punica + kernel. The main function is to maintain the state information for + Multi-LoRA, and to provide the interface for the punica operator. 
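As an illustrative aside (not part of this patch), the sampler_indices_padded arithmetic in convert_mapping above is easier to follow with a tiny worked example; the batch values below are made up:

    import torch

    max_loras = 4
    # Per-request LoRA slot; -1 means the request uses no LoRA.
    sampler_indices = torch.tensor([0, -1, 1])
    padded = sampler_indices.clone()
    padded[padded == -1] = max_loras - 1  # park "no LoRA" rows in the last slot
    # Row index into the flattened (num_loras * num_requests, vocab) LoRA logits.
    padded = torch.arange(len(padded)) + padded * len(padded)
    print(padded)  # tensor([ 0, 10,  5]) == slot * num_requests + request_idx

Each entry picks out the row that belongs to that request's LoRA once the (num_loras, num_requests, vocab) logits are flattened over their first two dimensions.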
+ """ + + def __init__(self, max_num_batched_tokens: int, max_batches: int, + device: str): + self._token_lora_indices = torch.empty(max_num_batched_tokens, + dtype=torch.long, + device=device) + self._sampler_indices = torch.empty(max_num_batched_tokens, + dtype=torch.long, + device=device) + self._sampler_indices_padded = torch.empty(max_num_batched_tokens, dtype=torch.long, device=device) - self.max_length: int = 0 - self.batch_size: int = -1 - - def _update_metada(self, token_lora_tensor: torch.Tensor) -> None: + self._embeddings_indices = torch.empty(2, + max_num_batched_tokens, + dtype=torch.long, + device=device) + self._long_lora_indices = torch.empty(max_num_batched_tokens, + dtype=torch.long, + device=device) + + # 5 is the number of indicies tensors. + # base_indices, sampler_indices, sampler_indices_padded, + # embeddings_indices,long_lora_indices + self.indices_len: List[Optional[int]] = [None] * 5 + # these attributes are the information required for sgmv kernel + self.b_seq_start_tensor = torch.zeros(max_batches, + dtype=torch.long, + device=device) + self.seq_length_tensor = torch.empty(max_batches, + dtype=torch.long, + device=device) + self.lora_indices_tensor = torch.empty(max_batches, + dtype=torch.long, + device=device) + self.max_length: int = 0 + self.batch_size: int = -1 + self.is_prefill = False + + def update_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: List[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + long_lora_context: Optional["LongContextLoRAContext"] = None, + ): + + self._update_base_metadata(mapping, lora_index_to_id, max_loras, + vocab_size, extra_vocab_size, + long_lora_context) + if mapping.is_prefill: + # Update metadata required for prefill-related operators. + self._update_prefill_metada(self.token_lora_indices) + self.is_prefill = True + else: + self.is_prefill = False + + def _update_base_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: List[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + long_lora_context: Optional["LongContextLoRAContext"] = None, + ): + ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + long_lora_offsets_tensor, + indices_len, + ) = convert_mapping( + mapping, + lora_index_to_id, + max_loras, + vocab_size, + extra_vocab_size, + long_lora_context, + ) + self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices) + self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) + self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( + sampler_indices_padded) + self._embeddings_indices[:embeddings_indices. + shape[0], :embeddings_indices.shape[1]].copy_( + embeddings_indices) + if long_lora_offsets_tensor is not None: + self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_( + long_lora_offsets_tensor) + else: + self._long_lora_indices.zero_() + + self.indices_len[:] = indices_len + + def _update_prefill_metada(self, token_lora_tensor: torch.Tensor) -> None: (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, batch_size, max_length) = _compute_meta(token_lora_tensor) @@ -95,280 +287,303 @@ def _update_metada(self, token_lora_tensor: torch.Tensor) -> None: self.batch_size = batch_size self.max_length = max_length - def get_metadata( - self, - token_lora_tensor: torch.Tensor, - need_update: bool = False - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, ]: - - #Need to recalculate and fill metadata. 
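As an illustrative aside (not part of this patch), the grouping that the prefill-metadata update relies on can be sketched with plain torch calls; this assumes the torch.unique_consecutive approach used by the metadata helper, with made-up per-token LoRA indices:

    import torch

    # Consecutive tokens that share a LoRA collapse into one sgmv "sequence".
    token_lora = torch.tensor([0, 0, 0, 1, 1, 2, 2, 2, 2])
    lora_ids, seq_lens = torch.unique_consecutive(token_lora, return_counts=True)
    seq_starts = torch.zeros_like(seq_lens)
    seq_starts[1:] = seq_lens.cumsum(dim=0)[:-1]
    print(lora_ids)    # tensor([0, 1, 2])
    print(seq_lens)    # tensor([3, 2, 4])
    print(seq_starts)  # tensor([0, 3, 5]); batch_size 3, max_length 4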
- if need_update: - self._update_metada(token_lora_tensor) - + @property + def prefill_metadata( + self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int]: + """ + This property provides a convenient way to access the necessary + metadata for prefill-related kernel computations. It returns a tuple + containing: + 1. b_seq_start_tensor: Tensor of sequence start positions + 2. seq_length_tensor: Tensor of sequence lengths + 3. lora_indices_tensor: Tensor of lora indices + 4. batch_size: batch size after clustering identical lora indices + 5. max_length: The maximum sequence length in the batch + """ return (self.b_seq_start_tensor[:self.batch_size], self.seq_length_tensor[:self.batch_size], self.lora_indices_tensor[:self.batch_size], self.batch_size, self.max_length) + @property + def token_lora_indices(self) -> torch.Tensor: + """ + This property provides the lora indices corresponding to each token + in the batch + """ + token_lora_len = self.indices_len[0] + return self._token_lora_indices[:token_lora_len] + + @property + def sampler_indices(self) -> torch.Tensor: + """ + This property is used to access the lora indices specifically for + LogitsProcessorWithLoRA + """ + sampler_indices_len = self.indices_len[1] + return self._sampler_indices[:sampler_indices_len] -def get_prefill_meta(token_lora_tensor: torch.Tensor, - need_update: bool = False): - prefill_helper = PrefillHelper(max_batches=256, - device=str(token_lora_tensor.device)) - return prefill_helper.get_metadata(token_lora_tensor, need_update) + @property + def sampler_indices_padded(self) -> torch.Tensor: + """ + This property provides access to padded sampler indices + """ + indices_padded_len = self.indices_len[2] + return self._sampler_indices_padded[:indices_padded_len] + @property + def embeddings_indices(self) -> torch.Tensor: + """ + This property provides access to the indices used for lora embeddings, + specifically for VocabParallelEmbeddingWithLoRA + """ + embeddings_indices_len = self.indices_len[3] + return self._embeddings_indices[:, :embeddings_indices_len] + + @property + def long_lora_indices(self) -> torch.Tensor: + """ + This property provides access to the indices used for long context + lora, specifically for LinearScalingRotaryEmbeddingWithLora + """ + long_lora_len = self.indices_len[4] + return self._long_lora_indices[:long_lora_len] -def shrink_prefill( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - scale: float, - need_update: bool = False, -): - ( - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - ) = get_prefill_meta(lora_indices_tensor, need_update) - sgmv_shrink( - x, - w_t_all, - y, - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - scale, - ) + def shrink_prefill( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + scale: float, + ): + sgmv_shrink( + x, + w_t_all, + y, + *self.prefill_metadata, + scale, + ) + + def shrink_decode( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + scale: float, + ): + bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale) + + def expand_prefill( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_input: bool, + ): + sgmv_expand( + x, + w_t_all, + y, + *self.prefill_metadata, + add_input, + ) + + def expand_decode( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_input: bool, + ): + 
bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_input) + + def expand_slice_prefill( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool, + ): + sgmv_expand_slice( + x, + w_t_all, + y, + *self.prefill_metadata, + y_offset, + y_slice_size, + add_input, + ) + + def expand_slice_decode( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool, + ): + bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, + y_slice_size, add_input) + + def add_shrink( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + scale: float, + ): + """ + Perform the ` y+=x@w_t_all` computation, which is suitable for the + GEMM of lora'a. + When `is_prefill is` true, it indicates that it is currently the + prefill stage, and the `shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the shrink_decode function + should be called. + """ + shrink_fun: Callable = (self.shrink_prefill + if self.is_prefill else self.shrink_decode) + shrink_fun(y, x, w_t_all, scale) + def add_expand( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_input: bool = True, + ): + """ + Perform the ` y+=x@w_t_all` computation, which is suitable for the + GEMM of lora'b. + When `is_prefill` is true, it indicates that it is currently the + prefill stage, and the `expand_prefill` function should be called. + Otherwise, it is the decode stage, and the expand_decode function + should be called. + """ -def shrink_decode( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - scale: float, -): - bgmv_shrink(x, w_t_all, y, lora_indices_tensor, scale) - - -def expand_prefill( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - add_input: bool, - need_update: bool = False, -): - ( - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - ) = get_prefill_meta(lora_indices_tensor, need_update) - sgmv_expand(x, w_t_all, y, b_seq_start_tensor, seq_length_tensor, - last_lora_indices_tensor, batch_size, max_length, add_input) - - -def expand_decode( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - add_input: bool, -): - bgmv_expand(x, w_t_all, y, lora_indices_tensor, add_input) - - -def expand_slice_prefill( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool, - need_update: bool = False, -): - ( - b_seq_start_tensor, - seq_length_tensor, - last_lora_indices_tensor, - batch_size, - max_length, - ) = get_prefill_meta(lora_indices_tensor, need_update) - sgmv_expand_slice(x, w_t_all, y, b_seq_start_tensor, seq_length_tensor, - last_lora_indices_tensor, batch_size, max_length, - y_offset, y_slice_size, add_input) - - -def expand_slice_decode(y: torch.Tensor, x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, layer_idx: int, - y_offset: Optional[int], y_slice_size: Optional[int], - add_input: bool): - bgmv_expand_slice(x, w_t_all, y, lora_indices_tensor, y_offset, - y_slice_size, add_input) - - -def add_shrink( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - 
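As an illustrative aside (not part of this patch), the shrink/expand pair documented above composes into the usual low-rank LoRA update; plain matmuls stand in for the sgmv/bgmv kernels and the shapes are made up:

    import torch

    x = torch.randn(4, 16)         # tokens x hidden
    lora_a = torch.randn(16, 8)    # hidden x rank
    lora_b = torch.randn(8, 32)    # rank x output
    scale = 0.5

    y = torch.zeros(4, 32)
    buffer = (x @ lora_a) * scale  # "shrink" into the rank-8 space
    y += buffer @ lora_b           # "expand" back out, accumulating into y
    assert torch.allclose(y, x @ lora_a @ lora_b * scale, atol=1e-4)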
lora_indices_tensor: torch.Tensor, - layer_idx: int, - scale: float, - is_prefill: bool, - need_update: bool = False, -): - """ - Perform the ` y+=x@w_t_all` computation, which is suitable for the - GEMM of lora'a. - When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `shrink_prefill` function should be called. - Otherwise, it is the decode stage, and the shrink_decode function - should be called. - """ - if is_prefill: - shrink_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, scale, - need_update) - else: - shrink_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, scale) - - -def add_expand( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - is_prefill: bool, - add_input: bool = True, - need_update: bool = False, -): - """ - Perform the ` y+=x@w_t_all` computation, which is suitable for the - GEMM of lora'b. - When `is_prefill` is true, it indicates that it is currently the - prefill stage, and the `expand_prefill` function should be called. - Otherwise, it is the decode stage, and the expand_decode function - should be called. - """ - if is_prefill: - expand_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, - add_input, need_update) - else: - expand_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, add_input) - - -def add_expand_slice( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - is_prefill: bool, - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool = True, - need_update: bool = False, -): - """ - Similar to `add_expand` - """ - if is_prefill: - expand_slice_prefill(y, x, w_t_all, lora_indices_tensor, layer_idx, - y_offset, y_slice_size, add_input, need_update) - else: - expand_slice_decode(y, x, w_t_all, lora_indices_tensor, layer_idx, - y_offset, y_slice_size, add_input) - - -def add_lora( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - lora_indices_tensor: torch.Tensor, - layer_idx: int, - scale: float, - is_prefill: bool, - y_offset: Optional[int] = None, - y_slice_size: Optional[int] = None, - *, - buffer: Optional[torch.Tensor] = None, - need_update: bool = False, -): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - Args: - y (torch.Tensor): Output tensor. Will be changed in-place. - x (torch.Tensor): Input tensor - wa_t_all (torch.Tensor): lora_a's weight - wb_t_all (torch.Tensor): lora_b's weight - lora_indices_tensor (torch.Tensor): _description_ - layer_idx (int): Layer index of LoRA weights. - scale (float): Scaling factor. - is_prefill (bool): prefiling stage - y_offset (Optional[int], optional): Offset to apply to the starting - column of y. - y_slice_size (Optional[int], optional): Size of the y column slice.. - buffer (Optional[torch.Tensor], optional): Defaults to None. - need_update (bool, optional): Indicates whether updating sgmv metadata - is needed. Defaults to False. 
- """ + expand_fun: Callable = (self.expand_prefill + if self.is_prefill else self.expand_decode) + expand_fun(y, x, w_t_all, add_input) + + def add_expand_slice(self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool = True): + """ + Similar to `add_expand` + """ - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default ,refer to: - # https://github.com/triton-lang/triton/issues/1387 - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - - add_shrink( - buffer, - x, - wa_t_all, - lora_indices_tensor, - 0, - scale, - is_prefill, - need_update=need_update, - ) - if y_offset is None and y_slice_size is None: - add_expand(y, - buffer, - wb_t_all, - lora_indices_tensor, - 0, - is_prefill, - add_input=True, - need_update=need_update) - else: - add_expand_slice(y, - buffer, - wb_t_all, - lora_indices_tensor, - 0, - is_prefill, - y_offset, - y_slice_size, - add_input=True, - need_update=need_update) + expand_slice_fun: Callable = (self.expand_slice_prefill + if self.is_prefill else + self.expand_slice_decode) + expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) + + def add_lora(self, + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + scale: float, + y_offset: Optional[int] = None, + y_slice_size: Optional[int] = None, + *, + buffer: Optional[torch.Tensor] = None) -> None: + """ + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + Args: + y (torch.Tensor): Output tensor. Will be changed in-place. + x (torch.Tensor): Input tensor + wa_t_all (torch.Tensor): lora_a's weight + wb_t_all (torch.Tensor): lora_b's weight + scale (float): Scaling factor. + y_offset (Optional[int], optional): Offset to apply to the starting + column of y. + y_slice_size (Optional[int], optional): Size of the y column slice.. + buffer (Optional[torch.Tensor], optional): Defaults to None. + """ + y_org = y + y = y.view(-1, y.shape[-1]) + x = x.view(-1, x.shape[-1]) + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default ,refer to: + # https://github.com/triton-lang/triton/issues/1387 + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + + self.add_shrink(buffer, x, wa_t_all, scale) + if y_offset is None and y_slice_size is None: + self.add_expand(y, buffer, wb_t_all, add_input=True) + else: + self.add_expand_slice(y, + buffer, + wb_t_all, + y_offset, + y_slice_size, + add_input=True) + y = y.view_as(y_org) + + def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, + torch.Tensor, + torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, + torch.Tensor, + torch.Tensor], + scale: float, + output_slices: Tuple[int, ...]) -> None: + """ + Applies lora to each input. Similar to add_lora, This method is + used for layers that are composed of multiple sublayers + (slices) packed together. 
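As an illustrative aside (not part of this patch), add_lora_packed_nslice walks the packed output one slice at a time via output_slices; the widths below are made up:

    # e.g. a packed QKV projection with widths (q, k, v)
    output_slices = (4096, 1024, 1024)
    offset_left = 0
    for slice_idx, width in enumerate(output_slices):
        # add_lora(...) writes into y[:, offset_left : offset_left + width]
        print(f"slice {slice_idx}: columns [{offset_left}, {offset_left + width})")
        offset_left += width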
+ """ + y_org = y + x = x.view(-1, x.shape[-1]) + y = y.view(-1, y.shape[-1]) + offset_left = 0 + # TODO fuse these kernels + for slice_idx in range(len(output_slices)): + self.add_lora(y, x, lora_a_stacked[slice_idx], + lora_b_stacked[slice_idx], scale, offset_left, + output_slices[slice_idx]) + offset_left += output_slices[slice_idx] + + y = y.view_as(y_org) + + def add_lora_logits(self, + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + scale, + *, + buffer: Optional[torch.Tensor] = None) -> None: + """ + LogitsProcessorWithLoRA always using bgmv + """ + y_org = y + y = y.view(-1, y.shape[-1]) + x = x.view(-1, x.shape[-1]) + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default ,refer to: + # https://github.com/triton-lang/triton/issues/1387 + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + + bgmv_shrink(x, wa_t_all, buffer, self.sampler_indices, scale) + bgmv_expand(buffer, wb_t_all, y, self.sampler_indices, add_inputs=True) + y = y.view_as(y_org) From 7035a2903d1c303122f0a06e5b89a347977786bb Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 11 Jul 2024 11:38:49 +0800 Subject: [PATCH 49/71] update lora unit test --- tests/lora/test_layers.py | 140 +++++++++++-------- tests/lora/test_lora.py | 263 ------------------------------------ vllm/worker/model_runner.py | 12 +- 3 files changed, 89 insertions(+), 326 deletions(-) delete mode 100644 tests/lora/test_lora.py diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 7207af6b1a4b..6f33f56616fc 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -26,7 +26,8 @@ VocabParallelEmbeddingWithLoRA) # yapf: enable from vllm.lora.models import (LongContextLoRAContext, LoRALayerWeights, - PackedLoRALayerWeights, convert_mapping) + PackedLoRALayerWeights) +from vllm.lora.punica import PunicaWrapper from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, @@ -47,6 +48,9 @@ CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] +# We will launch different triton kernels between the prefill and decode +# stages, so we need to verify this. 
prefill stage(True) or decode stage(False) +STAGES = [True, False] def get_random_id_to_index(num_loras: int, @@ -182,10 +186,12 @@ def create_random_inputs( @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) -def test_embeddings(dist_init, num_loras, device, vocab_size) -> None: +@pytest.mark.parametrize("stage", STAGES) +def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: torch.set_default_device(device) max_loras = 8 + punica_wrapper = PunicaWrapper(8192, 256, device) lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16) @@ -204,7 +210,7 @@ def create_random_embedding_layer(): id_to_index = get_random_id_to_index(num_loras, max_loras) embedding, lora_embedding = create_random_embedding_layer() - + lora_embedding.set_mapping(punica_wrapper) lora_dict, _ = populate_loras( id_to_index, layer=lora_embedding, @@ -217,12 +223,12 @@ def create_random_embedding_layer(): input_size=(200, ), input_range=(1, vocab_size), ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, vocab_size, lora_config.lora_extra_vocab_size) - lora_embedding.set_mapping(*mapping_info) lora_result = lora_embedding(torch.cat(inputs)) @@ -255,12 +261,12 @@ def create_random_embedding_layer(): input_size=(200, ), input_range=(1, vocab_size), ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, vocab_size, lora_config.lora_extra_vocab_size) - lora_embedding.set_mapping(*mapping_info, ) lora_result = lora_embedding(torch.cat(inputs)) expected_result = embedding(torch.cat(inputs)) @@ -278,11 +284,13 @@ def create_random_embedding_layer(): @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) +@pytest.mark.parametrize("stage", STAGES) def test_embeddings_with_new_embeddings(dist_init, num_loras, device, - vocab_size) -> None: + vocab_size, stage) -> None: torch.set_default_device(device) max_loras = 8 + punica_wrapper = PunicaWrapper(8192, 256, device) lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16) @@ -318,6 +326,7 @@ def create_random_embedding_layer(): generate_embeddings_tensor=256, ) + lora_embedding.set_mapping(punica_wrapper) # All embeddings tensors have the same shape. 
embeddings_tensors = [ lora_dict[id].embeddings_tensor for id in sorted(lora_dict.keys()) @@ -334,8 +343,12 @@ def create_random_embedding_layer(): input_size=(200, ), input_range=(1, vocab_size), ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, + vocab_size, + lora_config.lora_extra_vocab_size) original_inputs = deepcopy(inputs) # Force some of the inputs to be in the extended embeddings range @@ -349,11 +362,6 @@ def create_random_embedding_layer(): (embedding_id + 1) * embeddings_tensor_len - 1) original_input_[-2] = vocab_size + embeddings_tensor_len - 1 - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, - vocab_size, - lora_config.lora_extra_vocab_size) - lora_embedding.set_mapping(*mapping_info, ) - expanded_embedding.weight[vocab_size:vocab_size + (embeddings_tensor_len * max_loras)] = torch.cat(embeddings_tensors) @@ -390,15 +398,13 @@ def create_random_embedding_layer(): input_size=(200, ), input_range=(1, vocab_size), ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - original_inputs = deepcopy(inputs) - - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, vocab_size, lora_config.lora_extra_vocab_size) - lora_embedding.set_mapping(*mapping_info, ) - lora_result = lora_embedding(torch.cat(original_inputs)) expected_result = expanded_embedding(torch.cat(inputs)) @@ -413,11 +419,13 @@ def create_random_embedding_layer(): @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) -def test_lm_head_logits_processor(dist_init, num_loras, device, - vocab_size) -> None: +@pytest.mark.parametrize("stage", STAGES) +def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, + stage) -> None: torch.set_default_device(device) max_loras = 8 + punica_wrapper = PunicaWrapper(8192, 256, device) lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16) @@ -443,7 +451,7 @@ def _pretest(): id_to_index = get_random_id_to_index(num_loras, max_loras) linear, logits_processor, lora_logits_processor = _pretest() - + lora_logits_processor.set_mapping(punica_wrapper) # NOTE: all the generated loras share the same embeddings tensor. 
lora_dict, _ = populate_loras( id_to_index, @@ -461,17 +469,17 @@ def _pretest(): input_range=(0, 1), input_type=torch.float16, ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - input_ = torch.rand(20, 1024) - mapping_info = convert_mapping( + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata( lora_mapping, id_to_index, max_loras, vocab_size, lora_config.lora_extra_vocab_size, ) - lora_logits_processor.set_mapping(*mapping_info, ) + input_ = torch.rand(20, 1024) lora_result = lora_logits_processor._get_logits( hidden_states=torch.cat(inputs), @@ -510,12 +518,16 @@ def _pretest(): input_range=(0, 1), input_type=torch.float16, ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, - vocab_size, - lora_config.lora_extra_vocab_size) - lora_logits_processor.set_mapping(*mapping_info, ) + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + vocab_size, + lora_config.lora_extra_vocab_size, + ) lora_result = lora_logits_processor._get_logits( hidden_states=torch.cat(inputs), @@ -538,10 +550,12 @@ def _pretest(): @pytest.mark.parametrize("orientation", ["row", "column"]) @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("stage", STAGES) def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, - device) -> None: + device, stage) -> None: torch.set_default_device(device) + punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, @@ -575,7 +589,7 @@ def create_random_linear_parallel_layer(): id_to_index = get_random_id_to_index(num_loras, max_loras) linear, lora_linear = create_random_linear_parallel_layer() - + lora_linear.set_mapping(punica_wrapper) lora_dict, _ = populate_loras( id_to_index, layer=lora_linear, @@ -589,16 +603,16 @@ def create_random_linear_parallel_layer(): input_range=(0, 1), input_type=torch.float16, ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - mapping_info = convert_mapping( + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata( lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size, ) - lora_linear.set_mapping(*mapping_info, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -628,11 +642,12 @@ def create_random_linear_parallel_layer(): input_range=(0, 1), input_type=torch.float16, ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size) - lora_linear.set_mapping(*mapping_info, ) lora_result = lora_linear(torch.cat(inputs))[0] expected_result = linear(torch.cat(inputs))[0] @@ -649,10 +664,12 @@ def create_random_linear_parallel_layer(): @pytest.mark.parametrize("repeats", [1, 2, 3]) @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("stage", STAGES) def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, - device) -> None: + device, stage) -> None: torch.set_default_device(device) + 
punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, @@ -707,7 +724,7 @@ class FakeConfig: id_to_index = get_random_id_to_index(num_loras, max_loras) linear, lora_linear = create_column_parallel_packed_layer() - + lora_linear.set_mapping(punica_wrapper) lora_dict, sublora_dict = populate_loras( id_to_index, layer=lora_linear, @@ -722,16 +739,17 @@ class FakeConfig: input_range=(0, 1), input_type=torch.float16, ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) - mapping_info = convert_mapping( + punica_wrapper.update_metadata( lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size, ) - lora_linear.set_mapping(*mapping_info) lora_result = lora_linear(torch.cat(inputs))[0] @@ -762,16 +780,18 @@ class FakeConfig: input_range=(0, 1), input_type=torch.float16, ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) - mapping_info = convert_mapping( + punica_wrapper.update_metadata( lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size, ) - lora_linear.set_mapping(*mapping_info) + # lora_linear.set_mapping(*mapping_info) lora_result = lora_linear(torch.cat(inputs))[0] expected_result = linear(torch.cat(inputs))[0] @@ -803,7 +823,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, if torch.cuda.is_available(): torch.cuda.manual_seed(seed) torch.set_default_device(device) - + punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, @@ -825,6 +845,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, is_neox_style, ) lora_rope = LinearScalingRotaryEmbeddingWithLora(rope) + lora_rope.set_mapping(punica_wrapper) lora_rope.create_lora_weights(max_loras, lora_config) linear_rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { @@ -840,6 +861,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, input_range=(0, lora_config.lora_extra_vocab_size), input_type=torch.float16, ) + lora_mapping = LoRAMapping(index_mapping, prompt_mapping) long_lora_context = LongContextLoRAContext(list(scaling_factors), rotary_dim) @@ -854,7 +876,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, for i in range(len(scaling_factors)): long_lora_context.offsets_by_lora_id[i] = scaling_factor_to_offset.get( scaling_factors[i], 0) - mapping_info = convert_mapping( + punica_wrapper.update_metadata( lora_mapping, id_to_index, max_loras, @@ -862,7 +884,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, lora_config.lora_extra_vocab_size, long_lora_context=long_lora_context, ) - lora_rope.set_mapping(*mapping_info) + # lora_rope.set_mapping(*mapping_info) positions = torch.randint(0, max_position, (batch_size, seq_len)) query = torch.randn(batch_size, diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py deleted file mode 100644 index a4ca7a93e62e..000000000000 --- a/tests/lora/test_lora.py +++ /dev/null @@ -1,263 +0,0 @@ -import pytest -import torch - -from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice - -from .utils import DummyLoRAManager - -TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4] -QKV_TENSOR_SIZES = [ - (8192, 1024, 1024), - (8192 // 8, 1024 // 8, 1024 // 8), - (4096, 4096, 
4096), - (4096 // 2, 4096 // 2, 4096 // 2), -] -BATCH_SIZES = [8, 32, 256] -RANKS = [8] -DTYPES = [torch.float16] -TOLERANCES = { - torch.float16: (5e-3, 5e-3), - torch.bfloat16: (3e-2, 2e-2), -} - -STAGES = [0, 1] #prefill stage(1) or decode stage(0) - - -@pytest.mark.parametrize("m", TENSOR_SIZES) -@pytest.mark.parametrize("n", TENSOR_SIZES) -@pytest.mark.parametrize("k", BATCH_SIZES) -@pytest.mark.parametrize("rank", RANKS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("stage", STAGES) -def test_apply_lora(m, n, k, rank, dtype, stage) -> None: - manager = DummyLoRAManager() - - module_name = "module" - weight = torch.rand([m, n], device="cuda", dtype=dtype) - - manager.init_random_lora(module_name, weight, rank=rank) - lora = manager.get_module_lora(module_name) - - input = torch.rand(k, n, device="cuda", dtype=dtype) - expected = input @ lora.lora_a @ lora.lora_b * lora.scaling - - lora_a_stack = torch.zeros(8, - 1, - lora.lora_a.shape[1], - lora.lora_a.shape[0], - device="cuda", - dtype=dtype) - lora_b_stack = torch.zeros(8, - 1, - lora.lora_b.shape[1], - lora.lora_b.shape[0], - device="cuda", - dtype=dtype) - for i in range(lora_a_stack.shape[0]): - lora_a_stack[i][0] = lora.lora_a.T - lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T - indices_info = [None] * 6 - indices_info[0] = k - indices_info[5] = stage - output = torch.zeros(k, m, device="cuda", dtype=dtype) - _apply_lora(input, - lora_a_stack, - lora_b_stack, - torch.randint(0, - lora_a_stack.shape[0], (len(input), ), - device="cuda"), - indices_info, - output, - need_update=True) - - rtol, atol = TOLERANCES[dtype] - assert torch.allclose(expected, output, rtol=rtol, atol=atol) - - output[:] = 0 - _apply_lora(input, - lora_a_stack, - lora_b_stack, - torch.full((len(input), ), -1, device="cuda"), - indices_info, - output, - need_update=True) - assert torch.allclose(torch.zeros_like(output), output) - - manager.reset_lora() - - -@pytest.mark.parametrize("m", TENSOR_SIZES) -@pytest.mark.parametrize("n", TENSOR_SIZES) -@pytest.mark.parametrize("k", BATCH_SIZES) -@pytest.mark.parametrize("rank", RANKS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("stage", STAGES) -def test_apply_lora_packed_2slice(m, n, k, rank, dtype, stage) -> None: - if m % 2 != 0: - pytest.skip("m must be divisible by 2") - if m // 2 not in TENSOR_SIZES: - pytest.skip("m//2 must be in TENSOR_SIZES") - - manager = DummyLoRAManager() - - module_name = "module" - weight = torch.rand([m // 2, n], device="cuda", dtype=dtype) - - manager.init_random_lora(module_name + "1", weight, rank=rank) - lora_1 = manager.get_module_lora(module_name + "1") - manager.init_random_lora(module_name + "2", weight, rank=rank) - lora_2 = manager.get_module_lora(module_name + "2") - - input = torch.rand(k, n, device="cuda", dtype=dtype) - expected = torch.cat([ - input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling, - input @ lora_2.lora_a @ lora_2.lora_b * lora_2.scaling - ], - dim=1) - - lora_a_stacks = [ - torch.zeros(8, - 1, - lora_1.lora_a.shape[1], - lora_1.lora_a.shape[0], - device="cuda", - dtype=dtype) for i in range(2) - ] - lora_b_stacks = [ - torch.zeros(8, - 1, - lora_1.lora_b.shape[1], - lora_1.lora_b.shape[0], - device="cuda", - dtype=dtype) for i in range(2) - ] - for i in range(lora_a_stacks[0].shape[0]): - lora_a_stacks[0][i][0] = lora_1.lora_a.T - lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T - lora_a_stacks[1][i][0] = lora_2.lora_a.T - lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T - 
indices_info = [None] * 6 - indices_info[0] = k - indices_info[5] = stage - output = torch.zeros(k, m, device="cuda", dtype=dtype) - _apply_lora_packed_nslice(input, - lora_a_stacks, - lora_b_stacks, - torch.randint(0, - lora_a_stacks[0].shape[0], - (len(input), ), - device="cuda"), - indices_info, - output, (m // 2, m // 2), - need_update=True) - - rtol, atol = TOLERANCES[dtype] - assert torch.allclose(expected, output, rtol=rtol, atol=atol) - - output[:] = 0 - _apply_lora_packed_nslice(input, - lora_a_stacks, - lora_b_stacks, - torch.full((len(input), ), -1, device="cuda"), - indices_info, - output, (m // 2, m // 2), - need_update=True) - assert torch.allclose(torch.zeros_like(output), output) - - manager.reset_lora() - - -@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES) -@pytest.mark.parametrize("n", TENSOR_SIZES) -@pytest.mark.parametrize("k", BATCH_SIZES) -@pytest.mark.parametrize("rank", RANKS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("stage", STAGES) -def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype, stage) -> None: - manager = DummyLoRAManager() - - module_name = "module" - weight_q = torch.empty(qkv[0], n, device="cuda", dtype=dtype) - weight_kv = torch.empty(qkv[1], n, device="cuda", dtype=dtype) - - manager.init_random_lora(module_name + "q", weight_q, rank=rank) - lora_q = manager.get_module_lora(module_name + "q") - manager.init_random_lora(module_name + "k", weight_kv, rank=rank) - lora_k = manager.get_module_lora(module_name + "k") - manager.init_random_lora(module_name + "v", weight_kv, rank=rank) - lora_v = manager.get_module_lora(module_name + "v") - - input = torch.rand(k, n, device="cuda", dtype=dtype) - expected = torch.cat([ - input @ lora_q.lora_a @ lora_q.lora_b * lora_q.scaling, - input @ lora_k.lora_a @ lora_k.lora_b * lora_k.scaling, - input @ lora_v.lora_a @ lora_v.lora_b * lora_v.scaling - ], - dim=1) - - lora_a_stacks = [ - torch.zeros(8, - 1, - lora_q.lora_a.shape[1], - lora_q.lora_a.shape[0], - device="cuda", - dtype=dtype) - ] + [ - torch.zeros(8, - 1, - lora_k.lora_a.shape[1], - lora_k.lora_a.shape[0], - device="cuda", - dtype=dtype) for i in range(2) - ] - lora_b_stacks = [ - torch.zeros(8, - 1, - lora_q.lora_b.shape[1], - lora_q.lora_b.shape[0], - device="cuda", - dtype=dtype) - ] + [ - torch.zeros(8, - 1, - lora_k.lora_b.shape[1], - lora_k.lora_b.shape[0], - device="cuda", - dtype=dtype) for i in range(2) - ] - for i in range(lora_a_stacks[0].shape[0]): - lora_a_stacks[0][i][0] = lora_q.lora_a.T - lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T - lora_a_stacks[1][i][0] = lora_k.lora_a.T - lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T - lora_a_stacks[2][i][0] = lora_v.lora_a.T - lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T - indices_info = [None] * 6 - indices_info[0] = k - indices_info[5] = stage #decoding stage - output = torch.zeros(k, sum(qkv), device="cuda", dtype=dtype) - _apply_lora_packed_nslice(input, - lora_a_stacks, - lora_b_stacks, - torch.randint(0, - lora_a_stacks[0].shape[0], - (len(input), ), - device="cuda"), - indices_info, - output, (qkv[0], qkv[1], qkv[2]), - need_update=True) - - rtol, atol = TOLERANCES[dtype] - assert torch.allclose(expected, output, rtol=rtol, atol=atol) - - output[:] = 0 - _apply_lora_packed_nslice(input, - lora_a_stacks, - lora_b_stacks, - torch.full((len(input), ), -1, device="cuda"), - indices_info, - output, (qkv[0], qkv[1], qkv[2]), - need_update=True) - assert torch.allclose(torch.zeros_like(output), output) - - 
manager.reset_lora() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 7fc5febcd249..cdb84caebcfc 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -790,8 +790,10 @@ def _prepare_model_input_tensors( ) if self.lora_config: - lora_mapping = LoRAMapping(lora_index_mapping, lora_prompt_mapping, - is_prompt) + lora_mapping = LoRAMapping( + **dict(index_mapping=lora_index_mapping, + prompt_mapping=lora_prompt_mapping, + is_prefill=is_prompt)) else: lora_mapping = None @@ -1138,8 +1140,10 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: ) if self.lora_config: - lora_mapping = LoRAMapping([0] * batch_size, - [0] * batch_size, False) + lora_mapping = LoRAMapping( + **dict(index_mapping=[0] * batch_size, + prompt_mapping=[0] * batch_size, + is_prefill=False)) self.set_active_loras(set(), lora_mapping) if self.prompt_adapter_config: From 391d7614dedee03fa6c44c7cdec768559b6d1841 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 12 Jul 2024 14:05:20 +0800 Subject: [PATCH 50/71] reduce triton overhead --- vllm/lora/ops/libentry.py | 137 +++++++++++++++++++++++++++++ vllm/lora/ops/sgmv_expand.py | 3 + vllm/lora/ops/sgmv_expand_slice.py | 3 + vllm/lora/ops/sgmv_shrink.py | 3 + vllm/lora/punica.py | 54 ++++++------ 5 files changed, 172 insertions(+), 28 deletions(-) create mode 100644 vllm/lora/ops/libentry.py diff --git a/vllm/lora/ops/libentry.py b/vllm/lora/ops/libentry.py new file mode 100644 index 000000000000..d3fcc1d8e05b --- /dev/null +++ b/vllm/lora/ops/libentry.py @@ -0,0 +1,137 @@ +# Modified from: https://github.com/FlagOpen/FlagGems +import inspect + +import triton + + +class LibEntry(triton.KernelInterface): + + def __init__( + self, + fn, + ): + self.fn = fn + self.arg_names = fn.arg_names + self.divisibility = 16 + self.kernel_cache = dict() + fn = self.fn + while not isinstance(fn, triton.runtime.JITFunction): + fn = fn.fn + self.jit_function: triton.runtime.JITFunction = fn + self.specialize_indices = [ + p.num for p in self.jit_function.params + if not p.is_constexpr and not p.do_not_specialize + ] + self.do_not_specialize_indices = [ + p.num for p in self.jit_function.params + if not p.is_constexpr and p.do_not_specialize + ] + + def key(self, spec_args, dns_args, const_args): + spec_key = [(arg.dtype, arg.data_ptr() % + self.divisibility == 0) if hasattr(arg, "data_ptr") else + (type(arg), arg) for arg in spec_args] + dns_key = [ + arg.dtype if hasattr( + arg, "data_ptr") else type(arg) if not isinstance(arg, int) + else "i32" if -(2**31) <= arg and arg <= 2**31 - + 1 else "u64" if 2**63 <= arg and arg <= 2**64 - 1 else "i64" + for arg in dns_args + ] + # const args passed by position + return tuple(spec_key + dns_key + const_args) + + def run(self, *args, **kwargs): + grid = kwargs["grid"] + + # collect all the arguments + spec_args = [] # specialize arguments + dns_args = [] # do not specialize arguments + const_args = [] # constexpr arguments + k_args = [] # kernel arguments + for i, arg in enumerate(args): + if i in self.specialize_indices: + k_args.append(arg) + spec_args.append(arg) + elif i in self.do_not_specialize_indices: + k_args.append(arg) + dns_args.append(arg) + else: + const_args.append(arg) + for p in self.jit_function.params[len(args):]: + if p.name in kwargs: + val = kwargs[p.name] + elif p.default is inspect._empty: + continue + else: + val = p.default + + if p.is_constexpr: + const_args.append(val) + elif p.do_not_specialize: + dns_args.append(val) + k_args.append(val) + else: 
+ spec_args.append(val) + k_args.append(val) + + entry_key = self.key(spec_args, dns_args, const_args) + + if entry_key not in self.kernel_cache: + kernel = self.fn.run(*args, **kwargs) + fn = self.fn + # collect constexpr arguments for grid computation + constexprs = {} + while not isinstance(fn, triton.runtime.JITFunction): + if isinstance(fn, triton.runtime.Autotuner): + config = fn.best_config + constexprs["num_warps"] = config.num_warps + constexprs["num_stages"] = config.num_stages + constexprs["num_ctas"] = config.num_ctas + constexprs = {**constexprs, **config.kwargs} + elif isinstance(fn, triton.runtime.Heuristics): + for v, heur in fn.values.items(): + constexprs[v] = heur({ + **dict(zip(fn.arg_names, args)), + **kwargs, + **constexprs, + }) + else: + raise RuntimeError("Invalid Runtime Function") + fn = fn.fn + for p in self.jit_function.params: + if p.is_constexpr and p.name not in constexprs: + constexprs[p.name] = p.default + self.kernel_cache[entry_key] = (kernel, constexprs) + else: + kernel, constexprs = self.kernel_cache[entry_key] + + if callable(grid): + # collect all arguments to the grid fn,ie: + # 1. args, + # 2. kwargs, + # 3. all all other captured arguments in CompiledKernel from + # Autotunner & Heuristics when kwargs & captured args conflict, + # captured args have higher priority + meta = {**dict(zip(self.arg_names, args)), **kwargs, **constexprs} + grid = grid(meta) + + grid = grid + (1, 1) + + kernel[grid[0:3]](*k_args) + return + + +def libentry(): + """ + Decorator for triton library entries. + Motivation: + The runtime overhead of Triton kernels is the reason for the lower + performance of small kernels, particularly evident with smaller models. + Using this decorator can reduce Triton runtime overhead. + """ + + def decorator(fn): + return LibEntry(fn) + + return decorator diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 2873882bc263..27e91f5d1e4e 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -9,7 +9,10 @@ import triton import triton.language as tl +from .libentry import libentry + +@libentry() @triton.jit def _sgmv_expand_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 2078a47d7e8e..2906500e7873 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -9,7 +9,10 @@ import triton import triton.language as tl +from .libentry import libentry + +@libentry() @triton.jit def _sgmv_expand_slice_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 094bc62d9da4..c5bc1c08364c 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -9,7 +9,10 @@ import triton import triton.language as tl +from .libentry import libentry + +@libentry() @triton.jit def _sgmv_shrink_kernel( input_ptr, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index da51105fd907..16d41cfa11ff 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -3,8 +3,8 @@ Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). Punica: Multi-Tenant LoRA Serving. 
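# The libentry() decorator added above removes per-call Triton dispatch cost by
# memoizing the compiled kernel. Below is a minimal, self-contained sketch of
# that caching idea only; `CachedLauncher`, `compile_fn` and `launch_fn` are
# hypothetical placeholders, not the vLLM implementation.
import torch


class CachedLauncher:

    def __init__(self, compile_fn, launch_fn, divisibility=16):
        self._compile = compile_fn   # slow path: JIT/specialize once per key
        self._launch = launch_fn     # fast path: run an already-built kernel
        self._divisibility = divisibility
        self._cache = {}

    def _key(self, args):
        # Tensors contribute (dtype, pointer alignment); plain scalars
        # contribute (type, value), roughly mirroring Triton specialization.
        parts = []
        for arg in args:
            if isinstance(arg, torch.Tensor):
                aligned = arg.data_ptr() % self._divisibility == 0
                parts.append((arg.dtype, aligned))
            else:
                parts.append((type(arg), arg))
        return tuple(parts)

    def __call__(self, grid, *args):
        key = self._key(args)
        kernel = self._cache.get(key)
        if kernel is None:
            kernel = self._compile(grid, *args)   # pay the dispatch cost once
            self._cache[key] = kernel
        return self._launch(kernel, grid, *args)  # cheap repeated launches

# After the first call for a given combination of dtypes/alignment, only the
# dictionary lookup and the raw launch remain, which is where the reduction in
# Triton runtime overhead for small kernels comes from.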
https://arxiv.org/abs/2310.18547 -# """ -# from dataclasses import dataclass, field +""" + from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union import torch @@ -22,8 +22,7 @@ from vllm.lora.models import LongContextLoRAContext -@torch.compile -def _compute_meta( +def compute_meta( token_lora_tensor: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int]: """ @@ -172,7 +171,8 @@ def convert_mapping( class PunicaWrapper: - """PunicaWrapper is designed to manage and provide metadata for the punica + """ + PunicaWrapper is designed to manage and provide metadata for the punica kernel. The main function is to maintain the state information for Multi-LoRA, and to provide the interface for the punica operator. """ @@ -201,15 +201,15 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, # embeddings_indices,long_lora_indices self.indices_len: List[Optional[int]] = [None] * 5 # these attributes are the information required for sgmv kernel - self.b_seq_start_tensor = torch.zeros(max_batches, - dtype=torch.long, - device=device) - self.seq_length_tensor = torch.empty(max_batches, - dtype=torch.long, - device=device) - self.lora_indices_tensor = torch.empty(max_batches, - dtype=torch.long, - device=device) + self._seq_start_locs = torch.empty(max_batches, + dtype=torch.long, + device=device) + self._seq_lengths = torch.empty(max_batches, + dtype=torch.long, + device=device) + self._lora_indices_per_batch = torch.empty(max_batches, + dtype=torch.long, + device=device) self.max_length: int = 0 self.batch_size: int = -1 self.is_prefill = False @@ -276,13 +276,12 @@ def _update_base_metadata( def _update_prefill_metada(self, token_lora_tensor: torch.Tensor) -> None: (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, - batch_size, max_length) = _compute_meta(token_lora_tensor) + batch_size, max_length) = compute_meta(token_lora_tensor) - self.b_seq_start_tensor[:b_seq_start_tensor.shape[0]].copy_( + self._seq_start_locs[:b_seq_start_tensor.shape[0]].copy_( b_seq_start_tensor) - self.seq_length_tensor[:seq_length_tensor.shape[0]].copy_( - seq_length_tensor) - self.lora_indices_tensor[:lora_indices_tensor.shape[0]].copy_( + self._seq_lengths[:seq_length_tensor.shape[0]].copy_(seq_length_tensor) + self._lora_indices_per_batch[:lora_indices_tensor.shape[0]].copy_( lora_indices_tensor) self.batch_size = batch_size self.max_length = max_length @@ -292,18 +291,17 @@ def prefill_metadata( self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int]: """ This property provides a convenient way to access the necessary - metadata for prefill-related kernel computations. It returns a tuple - containing: - 1. b_seq_start_tensor: Tensor of sequence start positions - 2. seq_length_tensor: Tensor of sequence lengths - 3. lora_indices_tensor: Tensor of lora indices + metadata for prefill-related kernel computations. + 1. seq_start_locs: Tensor of sequence start positions + 2. seq_lengths: Tensor of sequence lengths + 3. lora_indices_per_batch: Tensor of lora indices 4. batch_size: batch size after clustering identical lora indices 5. 
max_length: The maximum sequence length in the batch """ - return (self.b_seq_start_tensor[:self.batch_size], - self.seq_length_tensor[:self.batch_size], - self.lora_indices_tensor[:self.batch_size], self.batch_size, - self.max_length) + return (self._seq_start_locs[:self.batch_size], + self._seq_lengths[:self.batch_size], + self._lora_indices_per_batch[:self.batch_size], + self.batch_size, self.max_length) @property def token_lora_indices(self) -> torch.Tensor: From 1dc8ec0e545ae4caf7f6462724d6ce23703754eb Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 12 Jul 2024 14:31:51 +0800 Subject: [PATCH 51/71] delete libentry --- vllm/lora/ops/libentry.py | 137 ----------------------------- vllm/lora/ops/sgmv_expand.py | 3 - vllm/lora/ops/sgmv_expand_slice.py | 3 - vllm/lora/ops/sgmv_shrink.py | 3 - 4 files changed, 146 deletions(-) delete mode 100644 vllm/lora/ops/libentry.py diff --git a/vllm/lora/ops/libentry.py b/vllm/lora/ops/libentry.py deleted file mode 100644 index d3fcc1d8e05b..000000000000 --- a/vllm/lora/ops/libentry.py +++ /dev/null @@ -1,137 +0,0 @@ -# Modified from: https://github.com/FlagOpen/FlagGems -import inspect - -import triton - - -class LibEntry(triton.KernelInterface): - - def __init__( - self, - fn, - ): - self.fn = fn - self.arg_names = fn.arg_names - self.divisibility = 16 - self.kernel_cache = dict() - fn = self.fn - while not isinstance(fn, triton.runtime.JITFunction): - fn = fn.fn - self.jit_function: triton.runtime.JITFunction = fn - self.specialize_indices = [ - p.num for p in self.jit_function.params - if not p.is_constexpr and not p.do_not_specialize - ] - self.do_not_specialize_indices = [ - p.num for p in self.jit_function.params - if not p.is_constexpr and p.do_not_specialize - ] - - def key(self, spec_args, dns_args, const_args): - spec_key = [(arg.dtype, arg.data_ptr() % - self.divisibility == 0) if hasattr(arg, "data_ptr") else - (type(arg), arg) for arg in spec_args] - dns_key = [ - arg.dtype if hasattr( - arg, "data_ptr") else type(arg) if not isinstance(arg, int) - else "i32" if -(2**31) <= arg and arg <= 2**31 - - 1 else "u64" if 2**63 <= arg and arg <= 2**64 - 1 else "i64" - for arg in dns_args - ] - # const args passed by position - return tuple(spec_key + dns_key + const_args) - - def run(self, *args, **kwargs): - grid = kwargs["grid"] - - # collect all the arguments - spec_args = [] # specialize arguments - dns_args = [] # do not specialize arguments - const_args = [] # constexpr arguments - k_args = [] # kernel arguments - for i, arg in enumerate(args): - if i in self.specialize_indices: - k_args.append(arg) - spec_args.append(arg) - elif i in self.do_not_specialize_indices: - k_args.append(arg) - dns_args.append(arg) - else: - const_args.append(arg) - for p in self.jit_function.params[len(args):]: - if p.name in kwargs: - val = kwargs[p.name] - elif p.default is inspect._empty: - continue - else: - val = p.default - - if p.is_constexpr: - const_args.append(val) - elif p.do_not_specialize: - dns_args.append(val) - k_args.append(val) - else: - spec_args.append(val) - k_args.append(val) - - entry_key = self.key(spec_args, dns_args, const_args) - - if entry_key not in self.kernel_cache: - kernel = self.fn.run(*args, **kwargs) - fn = self.fn - # collect constexpr arguments for grid computation - constexprs = {} - while not isinstance(fn, triton.runtime.JITFunction): - if isinstance(fn, triton.runtime.Autotuner): - config = fn.best_config - constexprs["num_warps"] = config.num_warps - constexprs["num_stages"] = config.num_stages - 
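# The prefill metadata described above (sequence start offsets, per-sequence
# lengths, one LoRA index per clustered batch, batch size and max length) can
# be derived from a flat per-token LoRA-id tensor. The helper below is a
# simplified illustration of what compute_meta provides, assuming tokens of
# the same request are stored contiguously; it is a sketch, not the vLLM code.
import torch


def compute_prefill_meta(token_lora_tensor: torch.Tensor):
    # Collapse consecutive runs of the same LoRA id into one logical batch.
    lora_indices_per_batch, seq_lengths = torch.unique_consecutive(
        token_lora_tensor, return_counts=True)
    # Start offset of each run inside the flat token dimension.
    cumulative = torch.cumsum(seq_lengths, dim=0)
    seq_start_locs = torch.zeros_like(cumulative)
    seq_start_locs[1:] = cumulative[:-1]
    batch_size = lora_indices_per_batch.size(0)
    max_length = int(seq_lengths.max())
    return (seq_start_locs, seq_lengths, lora_indices_per_batch,
            batch_size, max_length)

# Example: token_lora_tensor = torch.tensor([0, 0, 0, 0, 1, 1]) yields
# start locs [0, 4], lengths [4, 2], LoRA indices [0, 1], batch_size 2 and
# max_length 4.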
constexprs["num_ctas"] = config.num_ctas - constexprs = {**constexprs, **config.kwargs} - elif isinstance(fn, triton.runtime.Heuristics): - for v, heur in fn.values.items(): - constexprs[v] = heur({ - **dict(zip(fn.arg_names, args)), - **kwargs, - **constexprs, - }) - else: - raise RuntimeError("Invalid Runtime Function") - fn = fn.fn - for p in self.jit_function.params: - if p.is_constexpr and p.name not in constexprs: - constexprs[p.name] = p.default - self.kernel_cache[entry_key] = (kernel, constexprs) - else: - kernel, constexprs = self.kernel_cache[entry_key] - - if callable(grid): - # collect all arguments to the grid fn,ie: - # 1. args, - # 2. kwargs, - # 3. all all other captured arguments in CompiledKernel from - # Autotunner & Heuristics when kwargs & captured args conflict, - # captured args have higher priority - meta = {**dict(zip(self.arg_names, args)), **kwargs, **constexprs} - grid = grid(meta) - - grid = grid + (1, 1) - - kernel[grid[0:3]](*k_args) - return - - -def libentry(): - """ - Decorator for triton library entries. - Motivation: - The runtime overhead of Triton kernels is the reason for the lower - performance of small kernels, particularly evident with smaller models. - Using this decorator can reduce Triton runtime overhead. - """ - - def decorator(fn): - return LibEntry(fn) - - return decorator diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 27e91f5d1e4e..2873882bc263 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -9,10 +9,7 @@ import triton import triton.language as tl -from .libentry import libentry - -@libentry() @triton.jit def _sgmv_expand_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 2906500e7873..2078a47d7e8e 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -9,10 +9,7 @@ import triton import triton.language as tl -from .libentry import libentry - -@libentry() @triton.jit def _sgmv_expand_slice_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index c5bc1c08364c..094bc62d9da4 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -9,10 +9,7 @@ import triton import triton.language as tl -from .libentry import libentry - -@libentry() @triton.jit def _sgmv_shrink_kernel( input_ptr, From 9585adba77f7012247cf587f16fa6ab224d3f1ea Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 15 Jul 2024 18:20:05 +0800 Subject: [PATCH 52/71] delete punica_c code --- .github/workflows/scripts/build.sh | 2 - CMakeLists.txt | 62 - Dockerfile | 2 - Dockerfile.rocm | 3 +- csrc/punica/LICENSE | 217 --- csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu | 5 - csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu | 5 - csrc/punica/bgmv/bgmv_config.h | 218 --- csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu | 5 - csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu | 5 - csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu | 5 - csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu | 5 - csrc/punica/bgmv/bgmv_impl.cuh | 451 ------ csrc/punica/bgmv/generator.py | 48 - csrc/punica/bgmv/vec_dtypes.cuh | 1325 ------------------ csrc/punica/punica_ops.cu | 569 -------- csrc/punica/punica_ops.h | 11 - csrc/punica/torch_bindings.cpp | 18 - csrc/punica/type_convert.h | 82 -- docs/source/getting_started/installation.rst | 1 - setup.py | 10 - vllm/envs.py | 5 - 22 files changed, 1 insertion(+), 3053 deletions(-) delete mode 100644 csrc/punica/LICENSE delete mode 100644 csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu delete mode 100644 
csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu delete mode 100644 csrc/punica/bgmv/bgmv_config.h delete mode 100644 csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu delete mode 100644 csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu delete mode 100644 csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu delete mode 100644 csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu delete mode 100644 csrc/punica/bgmv/bgmv_impl.cuh delete mode 100644 csrc/punica/bgmv/generator.py delete mode 100644 csrc/punica/bgmv/vec_dtypes.cuh delete mode 100644 csrc/punica/punica_ops.cu delete mode 100644 csrc/punica/punica_ops.h delete mode 100644 csrc/punica/torch_bindings.cpp delete mode 100644 csrc/punica/type_convert.h diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index 60a3978f9abd..0a759d303238 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -13,8 +13,6 @@ $python_executable -m pip install -r requirements-cuda.txt # Limit the number of parallel jobs to avoid OOM export MAX_JOBS=1 -# Make sure punica is built for the release (for LoRA) -export VLLM_INSTALL_PUNICA_KERNELS=1 # Make sure release wheels are built for the following architectures export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" # Build diff --git a/CMakeLists.txt b/CMakeLists.txt index ced73ca03bfb..df504a022cdf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -220,61 +220,7 @@ define_gpu_extension_target( USE_SABI 3 WITH_SOABI) -# -# _punica_C extension -# - -set(VLLM_PUNICA_EXT_SRC - "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu" - "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu" - "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu" - "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu" - "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu" - "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu" - "csrc/punica/punica_ops.cu" - "csrc/punica/torch_bindings.cpp") - -# -# Copy GPU compilation flags+update for punica -# -set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS}) -list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS - "-D__CUDA_NO_HALF_OPERATORS__" - "-D__CUDA_NO_HALF_CONVERSIONS__" - "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" - "-D__CUDA_NO_HALF2_OPERATORS__") - -# -# Filter out CUDA architectures < 8.0 for punica. -# -if (${VLLM_GPU_LANG} STREQUAL "CUDA") - set(VLLM_PUNICA_GPU_ARCHES) - foreach(ARCH ${VLLM_GPU_ARCHES}) - string_to_ver(CODE_VER ${ARCH}) - if (CODE_VER GREATER_EQUAL 8.0) - list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH}) - endif() - endforeach() - message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}") -elseif(${VLLM_GPU_LANG} STREQUAL "HIP") - set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES}) - message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}") -endif() -if (VLLM_PUNICA_GPU_ARCHES) - define_gpu_extension_target( - _punica_C - DESTINATION vllm - LANGUAGE ${VLLM_GPU_LANG} - SOURCES ${VLLM_PUNICA_EXT_SRC} - COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS} - ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES} - USE_SABI 3 - WITH_SOABI) -else() - message(WARNING "Unable to create _punica_C target because none of the " - "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0") -endif() # # Add the `default` target which detects which extensions should be @@ -298,12 +244,4 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") message(STATUS "Enabling moe extension.") add_dependencies(default _moe_C) - # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or - # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and - # there are supported target arches. 
- if (VLLM_PUNICA_GPU_ARCHES AND - (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS)) - message(STATUS "Enabling punica extension.") - add_dependencies(default _punica_C) - endif() endif() diff --git a/Dockerfile b/Dockerfile index 7fbc168ace3d..590b0554cae9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -85,8 +85,6 @@ ENV MAX_JOBS=${max_jobs} # number of threads used by nvcc ARG nvcc_threads=8 ENV NVCC_THREADS=$nvcc_threads -# make sure punica kernels are built (for LoRA) -ENV VLLM_INSTALL_PUNICA_KERNELS=1 ARG buildkite_commit ENV BUILDKITE_COMMIT=${buildkite_commit} diff --git a/Dockerfile.rocm b/Dockerfile.rocm index befb0499f2e6..7e29a73010ab 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -150,8 +150,7 @@ RUN case "$(which python3)" in \ RUN --mount=type=cache,target=/root/.cache/pip \ pip install --upgrade numba scipy huggingface-hub[cli] -# Make sure punica kernels are built (for LoRA) -ENV VLLM_INSTALL_PUNICA_KERNELS=1 + # Workaround for ray >= 2.10.0 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 # Silences the HF Tokenizers warning diff --git a/csrc/punica/LICENSE b/csrc/punica/LICENSE deleted file mode 100644 index a46e2cdcadf7..000000000000 --- a/csrc/punica/LICENSE +++ /dev/null @@ -1,217 +0,0 @@ -Contains code from https://github.com/punica-ai/punica - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright {yyyy} {name of copyright owner} - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - ------------------------------------------------------------------------------------- - -This product bundles various third-party components under other open source licenses. -This section summarizes those components and their licenses. See licenses/ -for text of these licenses. 
- - -Apache-2.0 -* third_party/nvbench (with LLVM exception) -* third_party/flashinfer - -BSD-3-Clause: -* third_party/cutlass \ No newline at end of file diff --git a/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu b/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu deleted file mode 100644 index 86846c274c90..000000000000 --- a/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu +++ /dev/null @@ -1,5 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16) -FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu b/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu deleted file mode 100644 index de39c3121f5d..000000000000 --- a/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu +++ /dev/null @@ -1,5 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_bfloat16) -FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, float, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h deleted file mode 100644 index 2c8d007d8719..000000000000 --- a/csrc/punica/bgmv/bgmv_config.h +++ /dev/null @@ -1,218 +0,0 @@ -#pragma once - -template -void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, - const W_T *__restrict__ W, - const int64_t *__restrict__ indicies, int64_t y_offset, - int64_t full_y_size, int64_t batch_size, int64_t num_layers, - int64_t layer_idx, float scale); - -// clang-format off - -#define FOR_BGMV_WIDE(f, in_T, out_T, W_T, narrow) \ - f(in_T, out_T, W_T, narrow, 128) \ - f(in_T, out_T, W_T, narrow, 256) \ - f(in_T, out_T, W_T, narrow, 512) \ - f(in_T, out_T, W_T, narrow, 640) \ - f(in_T, out_T, W_T, narrow, 768) \ - f(in_T, out_T, W_T, narrow, 896) \ - f(in_T, out_T, W_T, narrow, 1024) \ - f(in_T, out_T, W_T, narrow, 1152) \ - f(in_T, out_T, W_T, narrow, 1216) \ - f(in_T, out_T, W_T, narrow, 1280) \ - f(in_T, out_T, W_T, narrow, 1536) \ - f(in_T, out_T, W_T, narrow, 1664) \ - f(in_T, out_T, W_T, narrow, 1728) \ - f(in_T, out_T, W_T, narrow, 1792) \ - f(in_T, out_T, W_T, narrow, 2048) \ - f(in_T, out_T, W_T, narrow, 2240) \ - f(in_T, out_T, W_T, narrow, 2304) \ - f(in_T, out_T, W_T, narrow, 2368) \ - f(in_T, out_T, W_T, narrow, 2432) \ - f(in_T, out_T, W_T, narrow, 2560) \ - f(in_T, out_T, W_T, narrow, 2752) \ - f(in_T, out_T, W_T, narrow, 2816) \ - f(in_T, out_T, W_T, narrow, 3072) \ - f(in_T, out_T, W_T, narrow, 3328) \ - f(in_T, out_T, W_T, narrow, 3456) \ - f(in_T, out_T, W_T, narrow, 3584) \ - f(in_T, out_T, W_T, narrow, 3712) \ - f(in_T, out_T, W_T, narrow, 4096) \ - f(in_T, out_T, W_T, narrow, 4480) \ - f(in_T, out_T, W_T, narrow, 4608) \ - f(in_T, out_T, W_T, narrow, 4736) \ - f(in_T, out_T, W_T, narrow, 4864) \ - f(in_T, out_T, W_T, narrow, 5120) \ - f(in_T, out_T, W_T, narrow, 5504) \ - f(in_T, out_T, W_T, narrow, 5632) \ - f(in_T, out_T, W_T, narrow, 5888) \ - f(in_T, out_T, W_T, narrow, 6144) \ - f(in_T, out_T, W_T, narrow, 6400) \ - f(in_T, out_T, W_T, narrow, 6848) \ - f(in_T, out_T, W_T, narrow, 6912) \ - f(in_T, out_T, W_T, narrow, 7168) \ - f(in_T, out_T, W_T, narrow, 7424) \ - f(in_T, out_T, W_T, narrow, 8192) \ - f(in_T, out_T, W_T, narrow, 8960) \ - f(in_T, out_T, W_T, narrow, 9216) \ - f(in_T, out_T, W_T, narrow, 9472) \ - f(in_T, out_T, W_T, narrow, 10240) \ - f(in_T, out_T, W_T, narrow, 11008) \ - f(in_T, out_T, W_T, narrow, 11264) \ - f(in_T, out_T, W_T, narrow, 12288) \ - f(in_T, out_T, W_T, narrow, 13696) \ - f(in_T, 
out_T, W_T, narrow, 13824) \ - f(in_T, out_T, W_T, narrow, 14336) \ - f(in_T, out_T, W_T, narrow, 14784) \ - f(in_T, out_T, W_T, narrow, 14848) \ - f(in_T, out_T, W_T, narrow, 15360) \ - f(in_T, out_T, W_T, narrow, 16384) \ - f(in_T, out_T, W_T, narrow, 18944) \ - f(in_T, out_T, W_T, narrow, 20480) \ - f(in_T, out_T, W_T, narrow, 22016) \ - f(in_T, out_T, W_T, narrow, 22528) \ - f(in_T, out_T, W_T, narrow, 24576) \ - f(in_T, out_T, W_T, narrow, 27392) \ - f(in_T, out_T, W_T, narrow, 27648) \ - f(in_T, out_T, W_T, narrow, 28672) \ - f(in_T, out_T, W_T, narrow, 29568) \ - f(in_T, out_T, W_T, narrow, 29696) \ - f(in_T, out_T, W_T, narrow, 32000) \ - f(in_T, out_T, W_T, narrow, 32256) \ - f(in_T, out_T, W_T, narrow, 32512) \ - f(in_T, out_T, W_T, narrow, 32768) \ - f(in_T, out_T, W_T, narrow, 33024) \ - f(in_T, out_T, W_T, narrow, 36864) \ - f(in_T, out_T, W_T, narrow, 43264) \ - f(in_T, out_T, W_T, narrow, 49152) \ - f(in_T, out_T, W_T, narrow, 49408) \ - f(in_T, out_T, W_T, narrow, 60544) \ - f(in_T, out_T, W_T, narrow, 60672) \ - f(in_T, out_T, W_T, narrow, 64000) \ - f(in_T, out_T, W_T, narrow, 64256) \ - f(in_T, out_T, W_T, narrow, 64512) \ - f(in_T, out_T, W_T, narrow, 102400) \ - f(in_T, out_T, W_T, narrow, 102656) \ - f(in_T, out_T, W_T, narrow, 102912) \ - f(in_T, out_T, W_T, narrow, 128000) \ - f(in_T, out_T, W_T, narrow, 128256) \ - f(in_T, out_T, W_T, narrow, 128512) \ - - -// Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA -// and vllm/tests/lora/test_punica.py - -// Used for defining kernels going from the variety of -// dim in to the narrow dim out - // Using it for the fully sharded column - // parallel LoRA A which splits the rank dim -#define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \ - f(in_T, out_T, W_T, 128, narrow) \ - f(in_T, out_T, W_T, 256, narrow) \ - f(in_T, out_T, W_T, 512, narrow) \ - f(in_T, out_T, W_T, 640, narrow) \ - f(in_T, out_T, W_T, 768, narrow) \ - f(in_T, out_T, W_T, 896, narrow) \ - f(in_T, out_T, W_T, 1024, narrow) \ - f(in_T, out_T, W_T, 1152, narrow) \ - f(in_T, out_T, W_T, 1216, narrow) \ - f(in_T, out_T, W_T, 1280, narrow) \ - f(in_T, out_T, W_T, 1536, narrow) \ - f(in_T, out_T, W_T, 1664, narrow) \ - f(in_T, out_T, W_T, 1728, narrow) \ - f(in_T, out_T, W_T, 1792, narrow) \ - f(in_T, out_T, W_T, 2048, narrow) \ - f(in_T, out_T, W_T, 2240, narrow) \ - f(in_T, out_T, W_T, 2304, narrow) \ - f(in_T, out_T, W_T, 2368, narrow) \ - f(in_T, out_T, W_T, 2432, narrow) \ - f(in_T, out_T, W_T, 2560, narrow) \ - f(in_T, out_T, W_T, 2752, narrow) \ - f(in_T, out_T, W_T, 2816, narrow) \ - f(in_T, out_T, W_T, 3072, narrow) \ - f(in_T, out_T, W_T, 3328, narrow) \ - f(in_T, out_T, W_T, 3456, narrow) \ - f(in_T, out_T, W_T, 3584, narrow) \ - f(in_T, out_T, W_T, 3712, narrow) \ - f(in_T, out_T, W_T, 4096, narrow) \ - f(in_T, out_T, W_T, 4480, narrow) \ - f(in_T, out_T, W_T, 4608, narrow) \ - f(in_T, out_T, W_T, 4736, narrow) \ - f(in_T, out_T, W_T, 4864, narrow) \ - f(in_T, out_T, W_T, 5120, narrow) \ - f(in_T, out_T, W_T, 5504, narrow) \ - f(in_T, out_T, W_T, 5632, narrow) \ - f(in_T, out_T, W_T, 5888, narrow) \ - f(in_T, out_T, W_T, 6144, narrow) \ - f(in_T, out_T, W_T, 6400, narrow) \ - f(in_T, out_T, W_T, 6848, narrow) \ - f(in_T, out_T, W_T, 6912, narrow) \ - f(in_T, out_T, W_T, 7168, narrow) \ - f(in_T, out_T, W_T, 7424, narrow) \ - f(in_T, out_T, W_T, 8192, narrow) \ - f(in_T, out_T, W_T, 8960, narrow) \ - f(in_T, out_T, W_T, 9216, narrow) \ - f(in_T, out_T, W_T, 9472, narrow) \ - f(in_T, out_T, W_T, 10240, narrow) \ - f(in_T, out_T, 
W_T, 11008, narrow) \ - f(in_T, out_T, W_T, 11264, narrow) \ - f(in_T, out_T, W_T, 12288, narrow) \ - f(in_T, out_T, W_T, 13696, narrow) \ - f(in_T, out_T, W_T, 13824, narrow) \ - f(in_T, out_T, W_T, 14336, narrow) \ - f(in_T, out_T, W_T, 14784, narrow) \ - f(in_T, out_T, W_T, 14848, narrow) \ - f(in_T, out_T, W_T, 15360, narrow) \ - f(in_T, out_T, W_T, 16384, narrow) \ - f(in_T, out_T, W_T, 18944, narrow) \ - f(in_T, out_T, W_T, 20480, narrow) \ - f(in_T, out_T, W_T, 22016, narrow) \ - f(in_T, out_T, W_T, 22528, narrow) \ - f(in_T, out_T, W_T, 24576, narrow) \ - f(in_T, out_T, W_T, 27392, narrow) \ - f(in_T, out_T, W_T, 27648, narrow) \ - f(in_T, out_T, W_T, 28672, narrow) \ - f(in_T, out_T, W_T, 29568, narrow) \ - f(in_T, out_T, W_T, 29696, narrow) \ - f(in_T, out_T, W_T, 32000, narrow) \ - f(in_T, out_T, W_T, 32256, narrow) \ - f(in_T, out_T, W_T, 32512, narrow) \ - f(in_T, out_T, W_T, 32768, narrow) \ - f(in_T, out_T, W_T, 33024, narrow) \ - f(in_T, out_T, W_T, 36864, narrow) \ - f(in_T, out_T, W_T, 43264, narrow) \ - f(in_T, out_T, W_T, 49152, narrow) \ - f(in_T, out_T, W_T, 49408, narrow) \ - f(in_T, out_T, W_T, 60544, narrow) \ - f(in_T, out_T, W_T, 60672, narrow) \ - f(in_T, out_T, W_T, 64000, narrow) \ - f(in_T, out_T, W_T, 64256, narrow) \ - f(in_T, out_T, W_T, 64512, narrow) \ - f(in_T, out_T, W_T, 102400, narrow) \ - f(in_T, out_T, W_T, 102656, narrow) \ - f(in_T, out_T, W_T, 102912, narrow) \ - f(in_T, out_T, W_T, 128000, narrow) \ - f(in_T, out_T, W_T, 128256, narrow) \ - f(in_T, out_T, W_T, 128512, narrow) \ -// Keep above in sync with vllm/lora/layers::SamplerWithLoRA - - -// Keep this in sync with vllm/config::LoRAConfig -#define FOR_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \ - FOR_BGMV_WIDE(f, in_T, out_T, W_T, 8) \ - FOR_BGMV_WIDE(f, in_T, out_T, W_T, 16) \ - FOR_BGMV_WIDE(f, in_T, out_T, W_T, 32) \ - FOR_BGMV_WIDE(f, in_T, out_T, W_T, 64) - - -#define FOR_INST_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \ - FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 1) \ - FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 2) \ - FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 4) \ - f(in_T, out_T, W_T, 8, 64) \ - f(in_T, out_T, W_T, 16, 64) \ - f(in_T, out_T, W_T, 32, 64) \ - f(in_T, out_T, W_T, 64, 64) - -// clang-format on diff --git a/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu b/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu deleted file mode 100644 index d225a1eaa82b..000000000000 --- a/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu +++ /dev/null @@ -1,5 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_half) -FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, nv_half, nv_half) diff --git a/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu b/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu deleted file mode 100644 index b37d288a7556..000000000000 --- a/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu +++ /dev/null @@ -1,5 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_half) -FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, float, nv_half) diff --git a/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu b/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu deleted file mode 100644 index a1ab2deecbab..000000000000 --- a/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu +++ /dev/null @@ -1,5 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_bfloat16) -FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_bfloat16, nv_bfloat16) diff --git 
a/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu b/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu deleted file mode 100644 index 0b35bf569989..000000000000 --- a/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu +++ /dev/null @@ -1,5 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_half) -FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_half, nv_half) diff --git a/csrc/punica/bgmv/bgmv_impl.cuh b/csrc/punica/bgmv/bgmv_impl.cuh deleted file mode 100644 index 8a3b8403b4a6..000000000000 --- a/csrc/punica/bgmv/bgmv_impl.cuh +++ /dev/null @@ -1,451 +0,0 @@ -#pragma once - -#include -#ifndef USE_ROCM -#include -#else -#include -#endif -#ifndef USE_ROCM -#include -#endif -#include -#include -#include - -#include "vec_dtypes.cuh" - -namespace cg = cooperative_groups; - -#ifdef USE_ROCM -template -__host__ __device__ -inline void* memcpy_blocking(void *dst, const void *src) { - // Does not handle the case of long datatypes - char *d = reinterpret_cast(dst); - const char *s = reinterpret_cast(src); - size_t i = 0; -#pragma unroll - for (i = 0; i < len; ++i) { - d[i] = s[i]; - } - return dst; -} -#endif - -#ifndef USE_ROCM - -// nthrs = (32, 4) -template -__global__ void -bgmv_shrink_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, - const W_T *__restrict__ W, - const int64_t *__restrict__ indicies, int64_t y_offset, - int64_t full_y_size, int64_t num_layers, int64_t layer_idx, - float scale) { - size_t batch_idx = blockIdx.y; - int64_t idx = indicies[batch_idx] * num_layers + layer_idx; - if (idx < 0) { - return; - } - - auto block = cg::this_thread_block(); - size_t j = blockIdx.x; - constexpr size_t num_pipeline_stages = 2; - constexpr size_t tile_size = tx * ty * vec_size; - __shared__ W_T W_shared[num_pipeline_stages * tile_size]; - __shared__ in_T X_shared[num_pipeline_stages * tile_size]; - __shared__ float y_warpwise[ty]; - - size_t W_shared_offset[num_pipeline_stages] = {0U, 1U * tile_size}; - size_t X_shared_offset[num_pipeline_stages] = {0U, 1U * tile_size}; - auto pipe = cuda::make_pipeline(); - - // pipeline load W/X and compute WX; - pipe.producer_acquire(); - cuda::memcpy_async(W_shared + (threadIdx.y * tx + threadIdx.x) * vec_size, - W + (idx * feat_out + j) * feat_in + - (threadIdx.y * tx + threadIdx.x) * vec_size, - cuda::aligned_size_t(W_copy_size), pipe); - cuda::memcpy_async(X_shared + (threadIdx.y * tx + threadIdx.x) * vec_size, - X + (batch_idx * feat_in) + - (threadIdx.y * tx + threadIdx.x) * vec_size, - cuda::aligned_size_t(X_copy_size), pipe); - pipe.producer_commit(); - size_t copy_idx, compute_idx; - float y = 0.f; - vec_t x_vec; - vec_t w_vec; - size_t tile_idx; - -#pragma unroll - for (tile_idx = 1; tile_idx < (feat_in + tile_size - 1) / tile_size; - ++tile_idx) { - copy_idx = tile_idx % num_pipeline_stages; - // pipeline stage: async copy W fragment - pipe.producer_acquire(); - if (tile_idx * tile_size + threadIdx.y * tx * vec_size < feat_in) { - cuda::memcpy_async(W_shared + W_shared_offset[copy_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size, - W + (idx * feat_out + j) * feat_in + - tile_idx * tile_size + - (threadIdx.y * tx + threadIdx.x) * vec_size, - cuda::aligned_size_t(W_copy_size), pipe); - cuda::memcpy_async(X_shared + X_shared_offset[copy_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size, - X + (batch_idx * feat_in) + tile_idx * tile_size + - (threadIdx.y * tx + threadIdx.x) * vec_size, - cuda::aligned_size_t(X_copy_size), pipe); - } - pipe.producer_commit(); - - compute_idx = (tile_idx - 
1) % num_pipeline_stages; - // pipeline stage: compute WX - pipe.consumer_wait(); - block.sync(); - x_vec.load(X_shared + X_shared_offset[compute_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size); - w_vec.load(W_shared + W_shared_offset[compute_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size); - float sum = 0.f; -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { - sum += float(w_vec[i]) * float(x_vec[i]) * scale; - } -#pragma unroll - for (size_t offset = tx / 2; offset > 0; offset /= 2) { - sum += __shfl_down_sync(0xffffffff, sum, offset); - } - y_warpwise[threadIdx.y] = sum; - block.sync(); -#pragma unroll - for (size_t i = 0; i < ty; ++i) { - y += y_warpwise[i]; - } - - block.sync(); - pipe.consumer_release(); - } - - compute_idx = (tile_idx - 1) % num_pipeline_stages; - // final pipeline stage - pipe.consumer_wait(); - block.sync(); - x_vec.load(X_shared + X_shared_offset[compute_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size); - w_vec.load(W_shared + W_shared_offset[compute_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size); - float sum = 0.f; -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { - sum += float(w_vec[i]) * float(x_vec[i]) * scale; - } -#pragma unroll - for (size_t offset = tx / 2; offset > 0; offset /= 2) { - sum += __shfl_down_sync(0xffffffff, sum, offset); - } - y_warpwise[threadIdx.y] = - ((tile_idx - 1) * tile_size + threadIdx.y * tx * vec_size < feat_in) - ? sum - : 0.f; - block.sync(); -#pragma unroll - for (size_t i = 0; i < ty; ++i) { - y += y_warpwise[i]; - } - - block.sync(); - pipe.consumer_release(); - - // write Y; - if (block.thread_rank() == 0) { - Y[batch_idx * full_y_size + y_offset + j] += static_cast(y); - } -} - -#else - -template -__global__ void -bgmv_shrink_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, - const W_T *__restrict__ W, - const int64_t *__restrict__ indicies, int64_t y_offset, - int64_t full_y_size, int64_t num_layers, int64_t layer_idx, - float scale) { - size_t batch_idx = blockIdx.y; - int64_t idx = indicies[batch_idx] * num_layers + layer_idx; - if (idx < 0) { - return; - } - - size_t j = blockIdx.x; - constexpr size_t tile_size = tx * ty * vec_size; - constexpr size_t num_tiles = (feat_in + tile_size - 1) / tile_size; - __shared__ float y_warpwise[ty]; - - float y = 0; - vec_t x_vec; - vec_t w_vec; - size_t tile_idx; - -#pragma unroll - for (tile_idx = 0; tile_idx < num_tiles; ++tile_idx) { - if (tile_idx * tile_size + (threadIdx.y * tx + threadIdx.x + 1) * vec_size - 1 < feat_in) { - x_vec.load(X + (batch_idx * feat_in) + - tile_idx * tile_size + - (threadIdx.y * tx + threadIdx.x) * vec_size); - w_vec.load(W + (idx * feat_out + j) * feat_in + - tile_idx * tile_size + - (threadIdx.y * tx + threadIdx.x) * vec_size); - } - - float sum = 0.f; -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { - sum += convert_type(w_vec[i]) * convert_type(x_vec[i]) * scale; - } -#pragma unroll - for (size_t offset = tx / 2; offset > 0; offset /= 2) { - sum += VLLM_SHFL_DOWN_SYNC(sum, offset); - } - - __syncthreads(); - - if (tile_idx * tile_size + (threadIdx.y * tx + threadIdx.x + 1) * vec_size - 1 < feat_in) { - y += sum; - } - } - - if (threadIdx.x == 0) { - y_warpwise[threadIdx.y] = y; - } - __syncthreads(); - - float y_write = 0.f; -#pragma unroll - for (size_t i = 0; i < ty; ++i) { - y_write += y_warpwise[i]; - } - - // write Y; - if (threadIdx.x == 0 && threadIdx.y == 0) { - size_t y_idx = batch_idx * full_y_size + y_offset + j; - Y[y_idx] = vllm_add(Y[y_idx], convert_type(y_write)); - } -} - 
-#endif - -// nthrs = (2, 16, 4) -template -__global__ void -bgmv_expand_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, - const W_T *__restrict__ W, - const int64_t *__restrict__ indicies, int64_t y_offset, - int64_t full_y_size, int64_t num_layers, int64_t layer_idx, - float scale) { - size_t batch_idx = blockIdx.y; - int64_t idx = indicies[batch_idx] * num_layers + layer_idx; - - if (idx < 0) { - return; - } - - auto block = cg::this_thread_block(); - size_t tile_idx = blockIdx.x; - - // load X; - vec_t x_vec; - x_vec.load(X + batch_idx * feat_in + threadIdx.x * vec_size); - - // load W; - vec_t w_vec; - w_vec.load(W + (idx * feat_out + tile_idx * tz * ty) * feat_in + - block.thread_rank() * vec_size); - - float sum = 0.f; -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { -#ifndef USE_ROCM - sum += float(w_vec[i]) * float(x_vec[i]) * scale; -#else - sum += convert_type(w_vec[i]) * convert_type(x_vec[i]) * scale; -#endif - } - - cg::thread_block_tile g = cg::tiled_partition(block); -#pragma unroll - for (size_t offset = tx / 2; offset > 0; offset /= 2) { - sum += g.shfl_down(sum, offset); - } - sum = g.shfl(sum, 0); - - if (threadIdx.x == 0) { -#ifndef USE_ROCM - Y[batch_idx * full_y_size + y_offset + tile_idx * (tz * ty) + - threadIdx.z * ty + threadIdx.y] += static_cast(sum); -#else - size_t y_idx = batch_idx * full_y_size + y_offset + tile_idx * (tz * ty) + - threadIdx.z * ty + threadIdx.y; - Y[y_idx] = vllm_add(Y[y_idx], convert_type(sum)); -#endif - } -} - -template -void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, - const W_T *__restrict__ W, - const int64_t *__restrict__ indicies, int64_t y_offset, - int64_t full_y_size, int64_t batch_size, int64_t num_layers, - int64_t layer_idx, float scale) { - constexpr size_t vec_size = 8; - constexpr int tz = 4; - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - if constexpr (feat_in <= feat_out) { - static_assert(feat_in % vec_size == 0); - constexpr int tx = feat_in / vec_size; - - static_assert((32 % tx == 0 && feat_out % (32 / tx * tz) == 0) || - (16 % tx == 0 && feat_out % (16 / tx * tz) == 0) || - (8 % tx == 0 && feat_out % (8 / tx * tz) == 0)); - - if constexpr (32 % tx == 0 && feat_out % (32 / tx * tz) == 0) { - constexpr int ty = 32 / tx; - dim3 nblks(feat_out / (ty * tz), batch_size); - dim3 nthrs(tx, ty, tz); - - bgmv_expand_kernel - <<>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } else if (16 % tx == 0 && feat_out % (16 / tx * tz) == 0) { - constexpr int ty = 16 / tx; - dim3 nblks(feat_out / (ty * tz), batch_size); - dim3 nthrs(tx, ty, tz); - - bgmv_expand_kernel - <<>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } else { - constexpr int ty = 8 / tx; - dim3 nblks(feat_out / (ty * tz), batch_size); - dim3 nthrs(tx, ty, tz); - - bgmv_expand_kernel - <<>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } - } else { -#ifndef USE_ROCM - static_assert(feat_in % (vec_size * 32) == 0 || - feat_in % (vec_size * 16) == 0 || - feat_in % (vec_size * 8) == 0); - - if constexpr (feat_in % (vec_size * 32) == 0) { - constexpr int tx = 32; - constexpr int ty = 4; - - dim3 nblks(feat_out, batch_size); - dim3 nthrs(tx, ty); - - bgmv_shrink_kernel - <<>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } else if constexpr (feat_in % (vec_size / 2 * 32) == 0) { - constexpr int tx = 32; - constexpr int ty = 4; - - dim3 nblks(feat_out, batch_size); - dim3 nthrs(tx, 
ty); - - bgmv_shrink_kernel - <<>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } else if constexpr (feat_in % (vec_size / 2 * 16) == 0) { - constexpr int tx = 16; - constexpr int ty = 4; - - dim3 nblks(feat_out, batch_size); - dim3 nthrs(tx, ty); - - bgmv_shrink_kernel - <<>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } -#else - constexpr size_t rocm_warp_size = warpSize; - -#define CHECK_INPUT_TILEABLE_BY(vec_size_) \ - feat_in % (rocm_warp_size * vec_size_) == 0 - -#define LAUNCH_BGMV_SHRINK_KERNELS_ROCM(factor_, vec_size_, tx_, ty_) \ - if constexpr (CHECK_INPUT_TILEABLE_BY(factor_)) { \ - constexpr size_t vec_size_shrink = vec_size_; \ - constexpr int tx = tx_; \ - constexpr int ty = ty_; \ - dim3 nblks(feat_out, batch_size); \ - dim3 nthrs(tx, ty); \ - bgmv_shrink_kernel \ - <<>>(Y, X, W, indicies, y_offset, \ - full_y_size, num_layers, layer_idx, \ - scale); \ - } - - static_assert(CHECK_INPUT_TILEABLE_BY(32) || - CHECK_INPUT_TILEABLE_BY(16) || - CHECK_INPUT_TILEABLE_BY( 8) || - CHECK_INPUT_TILEABLE_BY( 4) || - CHECK_INPUT_TILEABLE_BY( 2) || - CHECK_INPUT_TILEABLE_BY( 1)); - - LAUNCH_BGMV_SHRINK_KERNELS_ROCM(32, vec_size, rocm_warp_size, 32/vec_size) - else - LAUNCH_BGMV_SHRINK_KERNELS_ROCM(16, vec_size, rocm_warp_size, 16/vec_size) - else - LAUNCH_BGMV_SHRINK_KERNELS_ROCM( 8, vec_size, rocm_warp_size, 8/vec_size) - else - LAUNCH_BGMV_SHRINK_KERNELS_ROCM( 4, vec_size, rocm_warp_size/(vec_size/4), vec_size/4) - else - LAUNCH_BGMV_SHRINK_KERNELS_ROCM( 2, vec_size, rocm_warp_size/(vec_size/2), vec_size/2) - else - LAUNCH_BGMV_SHRINK_KERNELS_ROCM( 1, vec_size, rocm_warp_size/(vec_size/1), vec_size/1) - -#undef CHECK_INPUT_TILEABLE_BY -#undef LAUNCH_BGMV_SHRINK_KERNELS_ROCM -#endif - } -} - -#define INST_BGMV(feat_in, feat_out, in_T, out_T, W_T) \ - template void bgmv_kernel( \ - out_T * __restrict__ Y, const in_T *__restrict__ X, \ - const W_T *__restrict__ W, const int64_t *__restrict__ indicies, \ - int64_t y_offset, int64_t full_y_size, int64_t batch_size, \ - int64_t num_layers, int64_t layer_idx, float scale); - -#define INST_BGMV_ONESIDE(in_T, out_T, W_T, feat_in, feat_out) \ - INST_BGMV(feat_in, feat_out, in_T, out_T, W_T) - -#define INST_BGMV_TWOSIDE(in_T, out_T, W_T, narrow, wide) \ - INST_BGMV(narrow, wide, in_T, out_T, W_T) \ - INST_BGMV(wide, narrow, in_T, out_T, W_T) diff --git a/csrc/punica/bgmv/generator.py b/csrc/punica/bgmv/generator.py deleted file mode 100644 index 972df5a7208c..000000000000 --- a/csrc/punica/bgmv/generator.py +++ /dev/null @@ -1,48 +0,0 @@ -DTYPES = ["fp16", "bf16", "fp32"] -DTYPE_MAP = { - "fp16": "nv_half", - "bf16": "nv_bfloat16", - "fp32": "float", -} - -TEMPLATE = """ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype}) -FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, {input_dtype}, {output_dtype}, {weight_dtype}) -""".lstrip() # noqa: E501 - -for input_dtype in DTYPES: - for output_dtype in DTYPES: - for weight_dtype in DTYPES: - if weight_dtype == "fp32": - # FP32 weights are not supported. - continue - if output_dtype == "fp32": - # LoRA A matrix. - if input_dtype != weight_dtype: - # NOTE(woosuk): While Punica supports the case where the - # input and weight dtypes are different, we only generate - # the kernels the same dtypes to reduce the binary size. - continue - elif input_dtype == "fp32": - # LoRA B matrix. 
- if output_dtype != weight_dtype: - # NOTE(woosuk): While Punica supports the case where the - # output and weight dtypes are different, we only generate - # the kernels the same dtypes to reduce the binary size. - continue - elif not (input_dtype == output_dtype == weight_dtype): - # NOTE(woosuk): While Punica supports mixed data types for - # input, output, and weight, we only generate the kernels with - # the same data types to reduce the binary size. - continue - - kernel_definition = TEMPLATE.format( - input_dtype=DTYPE_MAP[input_dtype], - output_dtype=DTYPE_MAP[output_dtype], - weight_dtype=DTYPE_MAP[weight_dtype]) - filename = f"bgmv_{input_dtype}_{output_dtype}_{weight_dtype}.cu" - with open(filename, "w") as f: - f.write(kernel_definition) diff --git a/csrc/punica/bgmv/vec_dtypes.cuh b/csrc/punica/bgmv/vec_dtypes.cuh deleted file mode 100644 index 2738892e6dc4..000000000000 --- a/csrc/punica/bgmv/vec_dtypes.cuh +++ /dev/null @@ -1,1325 +0,0 @@ -#ifndef VEC_DTYPES_CUH_ -#define VEC_DTYPES_CUH_ - -#ifdef FLASHINFER_USE_FP8 -#include -#endif -#include - -#include - -#include "../type_convert.h" -#include "../../cuda_compat.h" - -#define FLASHINFER_INLINE \ - inline __attribute__((always_inline)) __device__ __host__ - -template -struct vec_t { - FLASHINFER_INLINE float_t &operator[](size_t i); - FLASHINFER_INLINE const float_t &operator[](size_t i) const; - FLASHINFER_INLINE void fill(float_t val); - FLASHINFER_INLINE void load(const float_t *ptr); - FLASHINFER_INLINE void store(float_t *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src); - template - FLASHINFER_INLINE void cast_load(const T *ptr); - template - FLASHINFER_INLINE void cast_store(T *ptr) const; - FLASHINFER_INLINE static void memcpy(float_t *dst, const float_t *src); -}; - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t &dst) { -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { - dst[i] = tgt_float_t(src[i]); - } -} - -template -FLASHINFER_INLINE void cast_load_impl(const src_float_t *src_ptr, - vec_t &dst) { - if constexpr (std::is_same::value) { - dst.load(src_ptr); - } else { - vec_t tmp; - tmp.load(src_ptr); - dst.cast_from(tmp); - } -} - -template -FLASHINFER_INLINE void cast_store_impl(const vec_t &src, - tgt_float_t *dst_ptr) { - if constexpr (std::is_same::value) { - src.store(dst_ptr); - } else { - vec_t tmp; - tmp.cast_from(src); - tmp.store(dst_ptr); - } -} - -#ifdef FLASHINFER_USE_FP8 -/******************* vec_t<__nv_fp8_e4m3> *******************/ - -// __nv_fp8_e4m3 x 1 -template <> -struct vec_t<__nv_fp8_e4m3, 1> { - __nv_fp8_e4m3 data; - - FLASHINFER_INLINE __nv_fp8_e4m3 &operator[](size_t i) { - return ((__nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e4m3 &operator[](size_t i) const { - return ((const __nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val); - FLASHINFER_INLINE void load(const __nv_fp8_e4m3 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e4m3 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3 *dst, - const __nv_fp8_e4m3 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 1>::fill(__nv_fp8_e4m3 val) { - data = val; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 
1>::load(const __nv_fp8_e4m3 *ptr) { - data = *ptr; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 1>::store( - __nv_fp8_e4m3 *ptr) const { - *ptr = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 1>::memcpy( - __nv_fp8_e4m3 *dst, const __nv_fp8_e4m3 *src) { - *dst = *src; -} - -// __nv_fp8_e4m3 x 2 -template <> -struct vec_t<__nv_fp8_e4m3, 2> { - __nv_fp8x2_e4m3 data; - - FLASHINFER_INLINE __nv_fp8_e4m3 &operator[](size_t i) { - return ((__nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e4m3 &operator[](size_t i) const { - return ((const __nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val); - FLASHINFER_INLINE void load(const __nv_fp8_e4m3 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e4m3 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3 *dst, - const __nv_fp8_e4m3 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::fill(__nv_fp8_e4m3 val) { - data.__x = - (__nv_fp8x2_storage_t(val.__x) << 8) | __nv_fp8x2_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::load(const __nv_fp8_e4m3 *ptr) { - data = *((__nv_fp8x2_e4m3 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::store( - __nv_fp8_e4m3 *ptr) const { - *((__nv_fp8x2_e4m3 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::memcpy( - __nv_fp8_e4m3 *dst, const __nv_fp8_e4m3 *src) { - *((__nv_fp8x2_e4m3 *)dst) = *((__nv_fp8x2_e4m3 *)src); -} - -// __nv_fp8_e4m3 x 4 - -template <> -struct vec_t<__nv_fp8_e4m3, 4> { - __nv_fp8x4_e4m3 data; - - FLASHINFER_INLINE __nv_fp8_e4m3 &operator[](size_t i) { - return ((__nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e4m3 &operator[](size_t i) const { - return ((const __nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val); - FLASHINFER_INLINE void load(const __nv_fp8_e4m3 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e4m3 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3 *dst, - const __nv_fp8_e4m3 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::fill(__nv_fp8_e4m3 val) { - data.__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::load(const __nv_fp8_e4m3 *ptr) { - data = *((__nv_fp8x4_e4m3 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::store( - __nv_fp8_e4m3 *ptr) const { - *((__nv_fp8x4_e4m3 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::memcpy( - __nv_fp8_e4m3 *dst, const __nv_fp8_e4m3 *src) { - *((__nv_fp8x4_e4m3 *)dst) = *((__nv_fp8x4_e4m3 *)src); -} - -// __nv_fp8_e4m3 x 8 - -template <> -struct vec_t<__nv_fp8_e4m3, 8> { - uint2 data; - - FLASHINFER_INLINE __nv_fp8_e4m3 &operator[](size_t i) { - return ((__nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e4m3 &operator[](size_t i) const { - 
return ((const __nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val); - FLASHINFER_INLINE void load(const __nv_fp8_e4m3 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e4m3 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3 *dst, - const __nv_fp8_e4m3 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::fill(__nv_fp8_e4m3 val) { - ((__nv_fp8x4_e4m3 *)(&data.x))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e4m3 *)(&data.y))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::load(const __nv_fp8_e4m3 *ptr) { - data = *((uint2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::store( - __nv_fp8_e4m3 *ptr) const { - *((uint2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::memcpy( - __nv_fp8_e4m3 *dst, const __nv_fp8_e4m3 *src) { - *((__nv_fp8_e4m3 *)dst) = *((__nv_fp8_e4m3 *)src); -} - -// __nv_fp8_e4m3 x 16 or more -template -struct vec_t<__nv_fp8_e4m3, vec_size> { - uint4 data[vec_size / 16]; - - FLASHINFER_INLINE __nv_fp8_e4m3 &operator[](size_t i) { - return ((__nv_fp8_e4m3 *)data)[i]; - } - FLASHINFER_INLINE const __nv_fp8_e4m3 &operator[](size_t i) const { - return ((const __nv_fp8_e4m3 *)data)[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((__nv_fp8x4_e4m3 *)(&(data[i].x)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e4m3 *)(&(data[i].y)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e4m3 *)(&(data[i].z)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e4m3 *)(&(data[i].w)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - } - } - FLASHINFER_INLINE void load(const __nv_fp8_e4m3 *ptr) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - data[i] = ((uint4 *)ptr)[i]; - } - } - FLASHINFER_INLINE void store(__nv_fp8_e4m3 *ptr) const { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((uint4 *)ptr)[i] = data[i]; - } - } - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3 *dst, - const __nv_fp8_e4m3 *src) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((uint4 *)dst)[i] = ((uint4 *)src)[i]; - } - } -}; - 
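The long shift-or expressions in the fill() methods above (and in the e5m2 section that follows) all do the same thing: replicate one 8-bit fp8 payload across every byte of a packed 16- or 32-bit storage word. A small Python sketch of that packing (the function name is ours):

    def replicate_fp8_byte(byte_val: int, lanes: int) -> int:
        # Same pattern as (x << 24) | (x << 16) | (x << 8) | x in the
        # vec_t<__nv_fp8_*, N>::fill specializations above.
        assert 0 <= byte_val <= 0xFF
        packed = 0
        for lane in range(lanes):
            packed |= byte_val << (8 * lane)
        return packed

    assert replicate_fp8_byte(0xAB, 2) == 0xABAB          # __nv_fp8x2 storage word
    assert replicate_fp8_byte(0xAB, 4) == 0xABABABAB      # __nv_fp8x4 storage word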
-/******************* vec_t<__nv_fp8_e5m2> *******************/ - -// __nv_fp8_e5m2 x 1 -template <> -struct vec_t<__nv_fp8_e5m2, 1> { - __nv_fp8_e5m2 data; - - FLASHINFER_INLINE __nv_fp8_e5m2 &operator[](size_t i) { - return ((__nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e5m2 &operator[](size_t i) const { - return ((const __nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val); - FLASHINFER_INLINE void load(const __nv_fp8_e5m2 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e5m2 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2 *dst, - const __nv_fp8_e5m2 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::fill(__nv_fp8_e5m2 val) { - data = val; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::load(const __nv_fp8_e5m2 *ptr) { - data = *ptr; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::store( - __nv_fp8_e5m2 *ptr) const { - *ptr = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::memcpy( - __nv_fp8_e5m2 *dst, const __nv_fp8_e5m2 *src) { - *dst = *src; -} - -// __nv_fp8_e5m2 x 2 -template <> -struct vec_t<__nv_fp8_e5m2, 2> { - __nv_fp8x2_e5m2 data; - - FLASHINFER_INLINE __nv_fp8_e5m2 &operator[](size_t i) { - return ((__nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e5m2 &operator[](size_t i) const { - return ((const __nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val); - FLASHINFER_INLINE void load(const __nv_fp8_e5m2 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e5m2 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2 *dst, - const __nv_fp8_e5m2 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::fill(__nv_fp8_e5m2 val) { - data.__x = - (__nv_fp8x2_storage_t(val.__x) << 8) | __nv_fp8x2_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::load(const __nv_fp8_e5m2 *ptr) { - data = *((__nv_fp8x2_e5m2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::store( - __nv_fp8_e5m2 *ptr) const { - *((__nv_fp8x2_e5m2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::memcpy( - __nv_fp8_e5m2 *dst, const __nv_fp8_e5m2 *src) { - *((__nv_fp8x2_e5m2 *)dst) = *((__nv_fp8x2_e5m2 *)src); -} - -// __nv_fp8_e5m2 x 4 - -template <> -struct vec_t<__nv_fp8_e5m2, 4> { - __nv_fp8x4_e5m2 data; - - FLASHINFER_INLINE __nv_fp8_e5m2 &operator[](size_t i) { - return ((__nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e5m2 &operator[](size_t i) const { - return ((const __nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val); - FLASHINFER_INLINE void load(const __nv_fp8_e5m2 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e5m2 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T 
*ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2 *dst, - const __nv_fp8_e5m2 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::fill(__nv_fp8_e5m2 val) { - data.__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::load(const __nv_fp8_e5m2 *ptr) { - data = *((__nv_fp8x4_e5m2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::store( - __nv_fp8_e5m2 *ptr) const { - *((__nv_fp8x4_e5m2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::memcpy( - __nv_fp8_e5m2 *dst, const __nv_fp8_e5m2 *src) { - *((__nv_fp8x4_e5m2 *)dst) = *((__nv_fp8x4_e5m2 *)src); -} - -// __nv_fp8_e5m2 x 8 - -template <> -struct vec_t<__nv_fp8_e5m2, 8> { - uint2 data; - - FLASHINFER_INLINE __nv_fp8_e5m2 &operator[](size_t i) { - return ((__nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e5m2 &operator[](size_t i) const { - return ((const __nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val); - FLASHINFER_INLINE void load(const __nv_fp8_e5m2 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e5m2 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2 *dst, - const __nv_fp8_e5m2 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::fill(__nv_fp8_e5m2 val) { - ((__nv_fp8x4_e5m2 *)(&data.x))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e5m2 *)(&data.y))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::load(const __nv_fp8_e5m2 *ptr) { - data = *((uint2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::store( - __nv_fp8_e5m2 *ptr) const { - *((uint2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::memcpy( - __nv_fp8_e5m2 *dst, const __nv_fp8_e5m2 *src) { - *((__nv_fp8_e5m2 *)dst) = *((__nv_fp8_e5m2 *)src); -} - -// __nv_fp8_e5m2 x 16 or more - -template -struct vec_t<__nv_fp8_e5m2, vec_size> { - uint4 data[vec_size / 16]; - - FLASHINFER_INLINE __nv_fp8_e5m2 &operator[](size_t i) { - return ((__nv_fp8_e5m2 *)data)[i]; - } - FLASHINFER_INLINE const __nv_fp8_e5m2 &operator[](size_t i) const { - return ((const __nv_fp8_e5m2 *)data)[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((__nv_fp8x4_e5m2 *)(&(data[i].x)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e5m2 *)(&(data[i].y)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e5m2 *)(&(data[i].z)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) 
<< 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e5m2 *)(&(data[i].w)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - } - } - FLASHINFER_INLINE void load(const __nv_fp8_e5m2 *ptr) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - data[i] = ((uint4 *)ptr)[i]; - } - } - FLASHINFER_INLINE void store(__nv_fp8_e5m2 *ptr) const { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((uint4 *)ptr)[i] = data[i]; - } - } - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2 *dst, - const __nv_fp8_e5m2 *src) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((uint4 *)dst)[i] = ((uint4 *)src)[i]; - } - } -}; -#endif - -/******************* vec_t *******************/ - -// half x 1 -template <> -struct vec_t { - half data; - - FLASHINFER_INLINE half &operator[](size_t i) { return ((half *)(&data))[i]; } - FLASHINFER_INLINE const half &operator[](size_t i) const { - return ((const half *)(&data))[i]; - } - FLASHINFER_INLINE void fill(half val); - FLASHINFER_INLINE void load(const half *ptr); - FLASHINFER_INLINE void store(half *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(half *dst, const half *src); -}; - -FLASHINFER_INLINE void vec_t::fill(half val) { data = val; } - -FLASHINFER_INLINE void vec_t::load(const half *ptr) { data = *ptr; } - -FLASHINFER_INLINE void vec_t::store(half *ptr) const { *ptr = data; } - -FLASHINFER_INLINE void vec_t::memcpy(half *dst, const half *src) { - *dst = *src; -} - -// half x 2 -template <> -struct vec_t { - half2 data; - - FLASHINFER_INLINE half &operator[](size_t i) { return ((half *)(&data))[i]; } - FLASHINFER_INLINE const half &operator[](size_t i) const { - return ((const half *)(&data))[i]; - } - FLASHINFER_INLINE void fill(half val); - FLASHINFER_INLINE void load(const half *ptr); - FLASHINFER_INLINE void store(half *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(half *dst, const half *src); -}; - -FLASHINFER_INLINE void vec_t::fill(half val) { - data = make_half2(val, val); -} - -FLASHINFER_INLINE void vec_t::load(const half *ptr) { - data = *((half2 *)ptr); -} - -FLASHINFER_INLINE void vec_t::store(half *ptr) const { - *((half2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t::memcpy(half *dst, const half *src) { - *((half2 *)dst) = *((half2 *)src); -} - -// half x 4 - -template <> -struct vec_t { - uint2 data; - - FLASHINFER_INLINE half &operator[](size_t i) { return ((half *)(&data))[i]; } - FLASHINFER_INLINE const half &operator[](size_t i) const { - return ((const half *)(&data))[i]; - } - FLASHINFER_INLINE 
void fill(half val); - FLASHINFER_INLINE void load(const half *ptr); - FLASHINFER_INLINE void store(half *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(half *dst, const half *src); -}; - -FLASHINFER_INLINE void vec_t::fill(half val) { - *(half2 *)(&data.x) = make_half2(val, val); - *(half2 *)(&data.y) = make_half2(val, val); -} - -FLASHINFER_INLINE void vec_t::load(const half *ptr) { - data = *((uint2 *)ptr); -} - -FLASHINFER_INLINE void vec_t::store(half *ptr) const { - *((uint2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t::memcpy(half *dst, const half *src) { - *((uint2 *)dst) = *((uint2 *)src); -} - -// half x 8 or more - -template -struct vec_t { - uint4 data[vec_size / 8]; - FLASHINFER_INLINE half &operator[](size_t i) { return ((half *)data)[i]; } - FLASHINFER_INLINE const half &operator[](size_t i) const { - return ((const half *)data)[i]; - } - FLASHINFER_INLINE void fill(half val) { -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { - *(half2 *)(&(data[i].x)) = make_half2(val, val); - *(half2 *)(&(data[i].y)) = make_half2(val, val); - *(half2 *)(&(data[i].z)) = make_half2(val, val); - *(half2 *)(&(data[i].w)) = make_half2(val, val); - } - } - FLASHINFER_INLINE void load(const half *ptr) { -#pragma unroll - for (size_t i = 0; i < vec_size / 8; ++i) { - data[i] = ((uint4 *)ptr)[i]; - } - } - FLASHINFER_INLINE void store(half *ptr) const { -#pragma unroll - for (size_t i = 0; i < vec_size / 8; ++i) { - ((uint4 *)ptr)[i] = data[i]; - } - } - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(half *dst, const half *src) { -#pragma unroll - for (size_t i = 0; i < vec_size / 8; ++i) { - ((uint4 *)dst)[i] = ((uint4 *)src)[i]; - } - } -}; - -/******************* vec_t *******************/ - -// nv_bfloat16 x 1 -template <> -struct vec_t { - nv_bfloat16 data; - - FLASHINFER_INLINE nv_bfloat16 &operator[](size_t i) { - return ((nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE const nv_bfloat16 &operator[](size_t i) const { - return ((const nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(nv_bfloat16 val); - FLASHINFER_INLINE void load(const nv_bfloat16 *ptr); - FLASHINFER_INLINE void store(nv_bfloat16 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src); -}; - -FLASHINFER_INLINE void vec_t::fill(nv_bfloat16 val) { - data = val; -} - -FLASHINFER_INLINE void vec_t::load(const nv_bfloat16 *ptr) { - data = *ptr; -} - -FLASHINFER_INLINE void vec_t::store(nv_bfloat16 *ptr) const { - *ptr = data; -} - -FLASHINFER_INLINE void vec_t::memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src) { - *dst = *src; -} - -// nv_bfloat16 x 2 -template <> -struct vec_t { - nv_bfloat162 
data; - - FLASHINFER_INLINE nv_bfloat16 &operator[](size_t i) { - return ((nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE const nv_bfloat16 &operator[](size_t i) const { - return ((const nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(nv_bfloat16 val); - FLASHINFER_INLINE void load(const nv_bfloat16 *ptr); - FLASHINFER_INLINE void store(nv_bfloat16 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src); -}; - -FLASHINFER_INLINE void vec_t::fill(nv_bfloat16 val) { - data = make_bfloat162(val, val); -} - -FLASHINFER_INLINE void vec_t::load(const nv_bfloat16 *ptr) { - data = *((nv_bfloat162 *)ptr); -} - -FLASHINFER_INLINE void vec_t::store(nv_bfloat16 *ptr) const { - *((nv_bfloat162 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t::memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src) { - *((nv_bfloat162 *)dst) = *((nv_bfloat162 *)src); -} - -// nv_bfloat16 x 4 - -template <> -struct vec_t { - uint2 data; - - FLASHINFER_INLINE nv_bfloat16 &operator[](size_t i) { - return ((nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE const nv_bfloat16 &operator[](size_t i) const { - return ((const nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(nv_bfloat16 val); - FLASHINFER_INLINE void load(const nv_bfloat16 *ptr); - FLASHINFER_INLINE void store(nv_bfloat16 *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src); -}; - -FLASHINFER_INLINE void vec_t::fill(nv_bfloat16 val) { - *(nv_bfloat162 *)(&data.x) = make_bfloat162(val, val); - *(nv_bfloat162 *)(&data.y) = make_bfloat162(val, val); -} - -FLASHINFER_INLINE void vec_t::load(const nv_bfloat16 *ptr) { - data = *((uint2 *)ptr); -} - -FLASHINFER_INLINE void vec_t::store(nv_bfloat16 *ptr) const { - *((uint2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t::memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src) { - *((uint2 *)dst) = *((uint2 *)src); -} - -// nv_bfloat16 x 8 or more - -template -struct vec_t { - uint4 data[vec_size / 8]; - - FLASHINFER_INLINE nv_bfloat16 &operator[](size_t i) { - return ((nv_bfloat16 *)data)[i]; - } - FLASHINFER_INLINE const nv_bfloat16 &operator[](size_t i) const { - return ((const nv_bfloat16 *)data)[i]; - } - FLASHINFER_INLINE void fill(nv_bfloat16 val) { -#pragma unoll - for (size_t i = 0; i < vec_size / 8; ++i) { - *(nv_bfloat162 *)(&(data[i].x)) = make_bfloat162(val, val); - *(nv_bfloat162 *)(&(data[i].y)) = make_bfloat162(val, val); - *(nv_bfloat162 *)(&(data[i].z)) = make_bfloat162(val, val); - *(nv_bfloat162 *)(&(data[i].w)) = make_bfloat162(val, val); - } - } - FLASHINFER_INLINE void load(const nv_bfloat16 *ptr) { -#pragma unoll - for (size_t i = 0; i < vec_size / 8; ++i) { - data[i] = ((uint4 *)ptr)[i]; - } - } - FLASHINFER_INLINE void store(nv_bfloat16 *ptr) const { -#pragma unoll - for (size_t i = 0; i < vec_size / 8; ++i) { - ((uint4 *)ptr)[i] = data[i]; - } - } - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { 
- cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src) { -#pragma unoll - for (size_t i = 0; i < vec_size / 8; ++i) { - ((uint4 *)dst)[i] = ((uint4 *)src)[i]; - } - } -}; - -/******************* vec_t *******************/ - -// float x 1 - -template <> -struct vec_t { - float data; - - FLASHINFER_INLINE float &operator[](size_t i) { - return ((float *)(&data))[i]; - } - FLASHINFER_INLINE const float &operator[](size_t i) const { - return ((const float *)(&data))[i]; - } - FLASHINFER_INLINE void fill(float val); - FLASHINFER_INLINE void load(const float *ptr); - FLASHINFER_INLINE void store(float *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(float *dst, const float *src); -}; - -FLASHINFER_INLINE void vec_t::fill(float val) { data = val; } - -FLASHINFER_INLINE void vec_t::load(const float *ptr) { data = *ptr; } - -FLASHINFER_INLINE void vec_t::store(float *ptr) const { *ptr = data; } - -FLASHINFER_INLINE void vec_t::memcpy(float *dst, const float *src) { - *dst = *src; -} - -// float x 2 - -template <> -struct vec_t { - float2 data; - - FLASHINFER_INLINE float &operator[](size_t i) { - return ((float *)(&data))[i]; - } - FLASHINFER_INLINE const float &operator[](size_t i) const { - return ((const float *)(&data))[i]; - } - FLASHINFER_INLINE void fill(float val); - FLASHINFER_INLINE void load(const float *ptr); - FLASHINFER_INLINE void store(float *ptr) const; - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - FLASHINFER_INLINE static void memcpy(float *dst, const float *src); -}; - -FLASHINFER_INLINE void vec_t::fill(float val) { - data = make_float2(val, val); -} - -FLASHINFER_INLINE void vec_t::load(const float *ptr) { - data = *((float2 *)ptr); -} - -FLASHINFER_INLINE void vec_t::store(float *ptr) const { - *((float2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t::memcpy(float *dst, const float *src) { - *((float2 *)dst) = *((float2 *)src); -} - -// float x 4 or more -template -struct vec_t { - float4 data[vec_size / 4]; - - FLASHINFER_INLINE float &operator[](size_t i) { return ((float *)(data))[i]; } - FLASHINFER_INLINE const float &operator[](size_t i) const { - return ((const float *)(data))[i]; - } - FLASHINFER_INLINE void fill(float val) { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - data[i] = make_float4(val, val, val, val); - } - } - FLASHINFER_INLINE void load(const float *ptr) { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - data[i] = ((float4 *)ptr)[i]; - } - } - FLASHINFER_INLINE void store(float *ptr) const { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((float4 *)ptr)[i] = data[i]; - } - } - template - FLASHINFER_INLINE void cast_from(const vec_t &src) { - cast_from_impl(src, *this); - } - template - FLASHINFER_INLINE void 
cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - FLASHINFER_INLINE static void memcpy(float *dst, const float *src) { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((float4 *)dst)[i] = ((float4 *)src)[i]; - } - } -}; - -/******************* vec_t type cast *******************/ - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((float2 *)(&dst.data))[i] = __half22float2(((half2 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = half(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((half2 *)(&dst.data))[i] = __float22half2_rn(((float2 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((float2 *)(&dst.data))[i] = - __bfloat1622float2(((nv_bfloat162 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = nv_bfloat16(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((nv_bfloat162 *)(&dst.data))[i] = - __float22bfloat162_rn(((float2 *)(&src.data))[i]); - } - } -} - -#ifdef FLASHINFER_USE_FP8 - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t<__nv_fp8_e4m3, vec_size> &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else if constexpr (vec_size == 2) { - *(float2 *)(&dst.data) = float2(*(__nv_fp8x2_e4m3 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((float4 *)(&dst.data))[i] = float4(((__nv_fp8x4_e4m3 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t<__nv_fp8_e4m3, vec_size> &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((half2 *)(&dst.data))[i] = half2(((__nv_fp8x2_e4m3 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t<__nv_fp8_e4m3, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = __nv_fp8_e4m3(src.data); - } else if constexpr (vec_size == 2) { - *(__nv_fp8x2_e4m3 *)(&dst.data) = __nv_fp8x2_e4m3(*(float2 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((__nv_fp8x4_e4m3 *)(&dst.data))[i] = - __nv_fp8x4_e4m3(((float4 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t<__nv_fp8_e4m3, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = __nv_fp8_e4m3(src.data); - } else if constexpr (vec_size == 2) { - *(__nv_fp8x2_e4m3 *)(&dst.data) = __nv_fp8x2_e4m3(*(half2 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - // NOTE(Zihao): need to double check if we properly handle flo and fhi - ((__nv_fp8x4_e4m3 *)(&dst.data))[i] = __nv_fp8x4_e4m3( - ((half2 *)(&src.data))[i * 2], ((half2 *)(&src.data))[i * 2 + 1]); - } - } -} - -template -FLASHINFER_INLINE void 
cast_from_impl(const vec_t<__nv_fp8_e5m2, vec_size> &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else if constexpr (vec_size == 2) { - *(float2 *)(&dst.data) = float2(*(__nv_fp8x2_e5m2 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((float4 *)(&dst.data))[i] = float4(((__nv_fp8x4_e5m2 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t<__nv_fp8_e5m2, vec_size> &src, - vec_t &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((half2 *)(&dst.data))[i] = half2(((__nv_fp8x2_e5m2 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t<__nv_fp8_e5m2, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = __nv_fp8_e5m2(src.data); - } else if constexpr (vec_size == 2) { - *(__nv_fp8x2_e5m2 *)(&dst.data) = __nv_fp8x2_e5m2(*(float2 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((__nv_fp8x4_e5m2 *)(&dst.data))[i] = - __nv_fp8x4_e5m2(((float4 *)(&src.data))[i]); - } - } -} - -template -FLASHINFER_INLINE void cast_from_impl(const vec_t &src, - vec_t<__nv_fp8_e5m2, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = __nv_fp8_e4m3(src.data); - } else if constexpr (vec_size == 2) { - *(__nv_fp8x2_e5m2 *)(&dst.data) = __nv_fp8x2_e5m2(*(half2 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - // NOTE(Zihao): need to double check if we properly handle flo and fhi - ((__nv_fp8x4_e5m2 *)(&dst.data))[i] = __nv_fp8x4_e5m2( - ((half2 *)(&src.data))[i * 2], ((half2 *)(&src.data))[i * 2 + 1]); - } - } -} - -#endif // FLASHINFER_USE_FP8 - -#endif // VEC_DTYPES_CUH_ diff --git a/csrc/punica/punica_ops.cu b/csrc/punica/punica_ops.cu deleted file mode 100644 index dd29820144b3..000000000000 --- a/csrc/punica/punica_ops.cu +++ /dev/null @@ -1,569 +0,0 @@ -#include -#include -#include - -#include "type_convert.h" -#include "../cuda_compat.h" -#include "bgmv/bgmv_config.h" - - -//====== utils ====== - -inline void check_shape(const torch::Tensor &a, const torch::Tensor &b, - const char *a_name, const char *b_name) { - TORCH_CHECK(a.dim() == b.dim(), a_name, ".dim() != ", b_name, ".dim(). ", - a.dim(), " vs ", b.dim()); - for (int i = 0; i < a.dim(); ++i) { - TORCH_CHECK(a.size(i) == b.size(i), a_name, ".size(", i, ") != ", b_name, - ".size(", i, ")"); - } -} - -inline constexpr uint64_t pack_u32(uint32_t a, uint32_t b) { - return (uint64_t(a) << 32) | uint64_t(b); -} - -#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") - -#define CHECK_CONTIGUOUS(x) \ - TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") - -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ - CHECK_CONTIGUOUS(x) - -#define CHECK_DIM(d, x) \ - TORCH_CHECK(x.dim() == d, #x " must be a " #d "D tensor") - -#define CHECK_SHAPE(a, b) check_shape(a, b, #a, #b) - -#define CHECK_EQ(a, b) \ - TORCH_CHECK(a == b, "CHECK_EQ(" #a ", " #b ") failed. 
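pack_u32 above folds the (in_features, out_features) pair into a single 64-bit key, and launch_bgmv_kernel below switches on that key to pick a pre-instantiated kernel. A minimal Python sketch of the keying scheme (the registry dict and the concrete feature sizes are ours, standing in for the generated CASE/CASE_ONESIDE switch arms):

    def pack_u32(a: int, b: int) -> int:
        # Same as the removed C++ helper: (uint64(a) << 32) | uint64(b).
        assert 0 <= a < 2**32 and 0 <= b < 2**32
        return (a << 32) | b

    # Hypothetical registry of instantiated kernels keyed by packed shape.
    kernel_registry = {
        pack_u32(4096, 16): "bgmv_shrink_4096_to_16",
        pack_u32(16, 4096): "bgmv_expand_16_to_4096",
    }

    def dispatch(in_features: int, out_features: int) -> str:
        key = pack_u32(in_features, out_features)
        if key not in kernel_registry:
            # Mirrors the default branch that makes launch_bgmv_kernel return
            # false and trip the "No suitable kernel." check.
            raise ValueError(
                f"No suitable kernel: h_in={in_features} h_out={out_features}")
        return kernel_registry[key]

    assert dispatch(4096, 16) == "bgmv_shrink_4096_to_16"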
", a, " vs ", b) - -//====== bgmv ====== - -template -inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W, - const int64_t *lora_indices, - uint32_t in_features, uint32_t out_features, - int64_t y_offset, int64_t full_y_size, - int64_t batch_size, int64_t num_layers, - int64_t layer_idx, float scale) { - // NOTE(woosuk): While Punica supports various combinations of input/output - // data types, we limit the supported data types to reduce the binary size. - constexpr bool is_input_float = std::is_same::value; - constexpr bool is_output_float = std::is_same::value; - if (is_input_float) { - if (!std::is_same::value) { - return false; - } - } else if (is_output_float) { - if (!std::is_same::value) { - return false; - } - } else if (!(std::is_same::value && - std::is_same::value)) { - return false; - } - - switch (pack_u32(in_features, out_features)) { -#define CASE_ONESIDE(_in_T, _out_T, _W_T, feat_in, feat_out) \ - case pack_u32(feat_in, feat_out): \ - bgmv_kernel(Y, X, W, lora_indices, y_offset, \ - full_y_size, batch_size, num_layers, \ - layer_idx, scale); \ - break; -#define CASE(_in_T, _out_T, _W_T, narrow, wide) \ - CASE_ONESIDE(in_T, out_T, W_T, narrow, wide) \ - CASE_ONESIDE(in_T, out_T, W_T, wide, narrow) - - FOR_BGMV_WIDE_NARROW(CASE, _, _, _) - FOR_INST_BGMV_WIDE_NARROW(CASE_ONESIDE, _, _, _) -#undef CASE -#undef CASE_ONESIDE - default: - return false; - } - return true; -} - -void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w, - torch::Tensor indicies, int64_t layer_idx, double scale) { - CHECK_INPUT(y); - CHECK_INPUT(x); - CHECK_INPUT(w); - CHECK_INPUT(indicies); - - CHECK_DIM(2, y); - CHECK_DIM(2, x); - CHECK_DIM(4, w); - CHECK_DIM(1, indicies); - - int64_t B = x.size(0); - int64_t h_in = x.size(1); - int64_t h_out = y.size(1); - int64_t num_layers = w.size(1); - CHECK_EQ(w.size(3), h_in); - CHECK_EQ(w.size(2), h_out); - CHECK_EQ(indicies.size(0), x.size(0)); - CHECK_EQ(y.size(0), x.size(0)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); - bool ok = false; - if (h_in <= 128512 && h_out <= 128512) { - // TODO: See if we can get rid of this massive nested switch - switch (x.scalar_type()) { - case at::ScalarType::Half: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - 
static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - 
static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - default: - break; - } - } - TORCH_CHECK(ok, "No suitable kernel.", " h_in=", h_in, " h_out=", h_out, - " dtype=", x.scalar_type(), " out_dtype=", y.scalar_type()); -} - -void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w, - torch::Tensor indicies, int64_t layer_idx, - double scale, int64_t h_in, int64_t h_out, - int64_t y_offset) { - CHECK_INPUT(y); - CHECK_INPUT(x); - CHECK_INPUT(w); - CHECK_INPUT(indicies); - - CHECK_DIM(2, y); - CHECK_DIM(2, x); - CHECK_DIM(4, w); - CHECK_DIM(1, indicies); - - int64_t B = x.size(0); - int64_t num_layers = w.size(1); - int64_t full_y_size = y.size(1); - CHECK_EQ(w.size(3), h_in); - CHECK_EQ(w.size(2), h_out); - CHECK_EQ(indicies.size(0), x.size(0)); - CHECK_EQ(y.size(0), x.size(0)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); - bool ok = false; - if (h_in <= 128512 && h_out <= 128512) { - // TODO: See if we can get rid of this massive nested switch - switch (x.scalar_type()) { - case at::ScalarType::Half: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - 
y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast(y.data_ptr()), - static_cast(x.data_ptr()), - static_cast(w.data_ptr()), - indicies.data_ptr(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - default: - break; - } - } - TORCH_CHECK(ok, "No suitable kernel.", " h_in=", h_in, " h_out=", h_out, - " dtype=", x.scalar_type(), 
" out_dtype=", y.scalar_type()); -} diff --git a/csrc/punica/punica_ops.h b/csrc/punica/punica_ops.h deleted file mode 100644 index 5d625d0564f7..000000000000 --- a/csrc/punica/punica_ops.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w, - torch::Tensor indicies, int64_t layer_idx, double scale); - -void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w, - torch::Tensor indicies, int64_t layer_idx, - double scale, int64_t h_in, int64_t h_out, - int64_t y_offset); diff --git a/csrc/punica/torch_bindings.cpp b/csrc/punica/torch_bindings.cpp deleted file mode 100644 index 894e229b6d9d..000000000000 --- a/csrc/punica/torch_bindings.cpp +++ /dev/null @@ -1,18 +0,0 @@ -#include "registration.h" -#include "punica_ops.h" - -TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { - m.def( - "dispatch_bgmv(Tensor! y, Tensor x, Tensor w, Tensor indicies, int " - "layer_idx, float scale) -> ()"); - m.impl("dispatch_bgmv", torch::kCUDA, &dispatch_bgmv); - - m.def( - "dispatch_bgmv_low_level(Tensor! y, Tensor x, Tensor w," - "Tensor indicies, int layer_idx," - "float scale, int h_in, int h_out," - "int y_offset) -> ()"); - m.impl("dispatch_bgmv_low_level", torch::kCUDA, &dispatch_bgmv_low_level); -} - -REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/csrc/punica/type_convert.h b/csrc/punica/type_convert.h deleted file mode 100644 index dff7ce49283d..000000000000 --- a/csrc/punica/type_convert.h +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef CSRC__PUNICA__TYPE_CONVERT_H__ -#define CSRC__PUNICA__TYPE_CONVERT_H__ - -#ifndef USE_ROCM - -#include -#include - -#else - -#include -#include - -#define __TYPE_CONVERT__HOST_DEVICE__ __host__ __device__ - -typedef __half nv_half; -typedef __hip_bfloat16 nv_bfloat16; -typedef __hip_bfloat162 nv_bfloat162; - -__TYPE_CONVERT__HOST_DEVICE__ -inline __hip_bfloat162 make_bfloat162(__hip_bfloat16 val) { - return __hip_bfloat162{val, val}; -} - -__TYPE_CONVERT__HOST_DEVICE__ -inline __hip_bfloat162 make_bfloat162(__hip_bfloat16 vall, __hip_bfloat16 valr) { - return __hip_bfloat162{vall, valr}; -} - -template -__TYPE_CONVERT__HOST_DEVICE__ -inline T_dst convert_type(T_src val) { - return static_cast(val); -} - -template <> -__TYPE_CONVERT__HOST_DEVICE__ -inline float convert_type<__half, float>(__half val) { - return __half2float(val); -} - -template <> -__TYPE_CONVERT__HOST_DEVICE__ -inline __half convert_type(float val) { - return __float2half(val); -} - -template <> -__TYPE_CONVERT__HOST_DEVICE__ -inline float convert_type<__hip_bfloat16, float>(__hip_bfloat16 val) { - return __bfloat162float(val); -} - -template <> -__TYPE_CONVERT__HOST_DEVICE__ -inline __hip_bfloat16 convert_type(float val) { - return __float2bfloat16(val); -} - -template -__TYPE_CONVERT__HOST_DEVICE__ -inline T vllm_add(T a, T b) { - return a + b; -} - -template <> -__TYPE_CONVERT__HOST_DEVICE__ -inline __half vllm_add<__half>(__half a, __half b) { - return __hadd(a, b); -} - -template <> -__TYPE_CONVERT__HOST_DEVICE__ -inline __hip_bfloat16 vllm_add<__hip_bfloat16>(__hip_bfloat16 a, __hip_bfloat16 b) { - return __hadd(a, b); -} - -#undef __TYPE_CONVERT__HOST_DEVICE__ - -#endif // USE_ROCM - -#endif // CSRC__PUNICA__TYPE_CONVERT_H__ diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index a9dfac8ff5af..99cf34622ab9 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -67,7 +67,6 @@ You can also 
build and install vLLM from source: $ git clone https://github.com/vllm-project/vllm.git $ cd vllm - $ # export VLLM_INSTALL_PUNICA_KERNELS=1 # optionally build for multi-LoRA capability $ pip install -e . # This may take 5-10 minutes. .. tip:: diff --git a/setup.py b/setup.py index 72ef26f15e40..63c1f466d291 100644 --- a/setup.py +++ b/setup.py @@ -181,9 +181,6 @@ def configure(self, ext: CMakeExtension) -> None: # match. cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)] - if _install_punica(): - cmake_args += ['-DVLLM_INSTALL_PUNICA_KERNELS=ON'] - # # Setup parallelism and build tool # @@ -274,10 +271,6 @@ def _build_custom_ops() -> bool: return _is_cuda() or _is_hip() or _is_cpu() -def _install_punica() -> bool: - return envs.VLLM_INSTALL_PUNICA_KERNELS - - def get_hipcc_rocm_version(): # Run the hipcc --version command result = subprocess.run(['hipcc', '--version'], @@ -446,9 +439,6 @@ def _read_requirements(filename: str) -> List[str]: if _build_custom_ops(): ext_modules.append(CMakeExtension(name="vllm._C")) - if _install_punica(): - ext_modules.append(CMakeExtension(name="vllm._punica_C")) - package_data = { "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] } diff --git a/vllm/envs.py b/vllm/envs.py index 5b4a2010d12e..9d21d8bcea70 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -40,7 +40,6 @@ MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None VLLM_USE_PRECOMPILED: bool = False - VLLM_INSTALL_PUNICA_KERNELS: bool = False VLLM_NO_DEPRECATION_WARNING: bool = False CMAKE_BUILD_TYPE: Optional[str] = None VERBOSE: bool = False @@ -74,10 +73,6 @@ "VLLM_USE_PRECOMPILED": lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")), - # If set, vllm will install Punica kernels - "VLLM_INSTALL_PUNICA_KERNELS": - lambda: bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))), - # CMake build type # If not set, defaults to "Debug" or "RelWithDebInfo" # Available options: "Debug", "Release", "RelWithDebInfo" From d1ef5a083d3ebbae8eb4cde504fdeb7807beaece Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 16 Jul 2024 10:08:34 +0800 Subject: [PATCH 53/71] add libentry --- vllm/lora/ops/libentry.py | 97 ++++++++++++++++++++++++++++++ vllm/lora/ops/sgmv_expand.py | 3 + vllm/lora/ops/sgmv_expand_slice.py | 3 + vllm/lora/ops/sgmv_shrink.py | 3 + 4 files changed, 106 insertions(+) create mode 100644 vllm/lora/ops/libentry.py diff --git a/vllm/lora/ops/libentry.py b/vllm/lora/ops/libentry.py new file mode 100644 index 000000000000..a0bc5de9cb07 --- /dev/null +++ b/vllm/lora/ops/libentry.py @@ -0,0 +1,97 @@ +# Copied From https://github.com/FlagOpen/FlagGems + +import triton + + + +class LibEntry(triton.KernelInterface): + def __init__( + self, + fn, + ): + self.fn = fn + self.arg_names = fn.arg_names + self.divisibility = 16 + self.config_cache = dict() + self.kernel_cache = dict() + if isinstance(fn, triton.runtime.Autotuner): + self.rt = "Autotuner" + elif isinstance(fn, triton.runtime.Heuristics): + self.rt = "Heuristics" + else: + self.rt = "JitFunction" + + def run(self, *args, **kwargs): + key = [] + for arg in args: + if hasattr(arg, "data_ptr"): + key.append(arg.dtype) + key.append(arg.data_ptr() % self.divisibility == 0) + elif isinstance(arg, int): + key.append(arg) + entry_key = tuple(key) + + config = {} + # Autotuner + if self.rt == "Autotuner": + if entry_key not in self.config_cache: + # tune + kernel = self.fn.run(*args, **kwargs) + config = self.fn.best_config.kwargs + self.config_cache[entry_key] = config + 
self.kernel_cache[entry_key] = kernel + return + else: + # tuned + config = self.config_cache[entry_key] + kernel = self.kernel_cache[entry_key] + # Heuristics + elif self.rt == "Heuristics": + if entry_key not in self.kernel_cache: + # compile + kernel = self.fn.run(*args, **kwargs) + self.kernel_cache[entry_key] = kernel + return + else: + # compiled + for v, heur in self.fn.values.items(): + config[v] = heur( + {**dict(zip(self.arg_names, args)), **kwargs} + ) + kernel = self.kernel_cache[entry_key] + # JitFunction + else: + if entry_key not in self.kernel_cache: + # compile + kernel = self.fn.run(*args, **kwargs) + self.kernel_cache[entry_key] = kernel + return + else: + # compiled + args = tuple([ + arg + for i, arg in enumerate(args) + if not self.fn.params[i].is_constexpr + ]) + kernel = self.kernel_cache[entry_key] + grid = kwargs["grid"] + if callable(grid): + # grid_fn + current = dict(**kwargs, **config) + meta = {**dict(zip(self.arg_names, args)), **current} + grid = grid(meta) + grid = grid + (1, 1) + + kernel[grid[0:3]](*args) + return + + +def libentry(): + """ + Decorator for triton library entries. + """ + + def decorator(fn): + return LibEntry(fn) + + return decorator diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 2873882bc263..27e91f5d1e4e 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -9,7 +9,10 @@ import triton import triton.language as tl +from .libentry import libentry + +@libentry() @triton.jit def _sgmv_expand_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 2078a47d7e8e..2906500e7873 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -9,7 +9,10 @@ import triton import triton.language as tl +from .libentry import libentry + +@libentry() @triton.jit def _sgmv_expand_slice_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index 094bc62d9da4..c5bc1c08364c 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -9,7 +9,10 @@ import triton import triton.language as tl +from .libentry import libentry + +@libentry() @triton.jit def _sgmv_shrink_kernel( input_ptr, From b19ee954ca2a57312c3fbb85c5af79025537ed52 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 16 Jul 2024 10:14:18 +0800 Subject: [PATCH 54/71] format --- vllm/lora/ops/libentry.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/lora/ops/libentry.py b/vllm/lora/ops/libentry.py index a0bc5de9cb07..09790dc38c3c 100644 --- a/vllm/lora/ops/libentry.py +++ b/vllm/lora/ops/libentry.py @@ -3,8 +3,8 @@ import triton - class LibEntry(triton.KernelInterface): + def __init__( self, fn, @@ -55,9 +55,10 @@ def run(self, *args, **kwargs): else: # compiled for v, heur in self.fn.values.items(): - config[v] = heur( - {**dict(zip(self.arg_names, args)), **kwargs} - ) + config[v] = heur({ + **dict(zip(self.arg_names, args)), + **kwargs + }) kernel = self.kernel_cache[entry_key] # JitFunction else: @@ -69,8 +70,7 @@ def run(self, *args, **kwargs): else: # compiled args = tuple([ - arg - for i, arg in enumerate(args) + arg for i, arg in enumerate(args) if not self.fn.params[i].is_constexpr ]) kernel = self.kernel_cache[entry_key] From 68622d1b1814b074edd3e90497249a681d907f87 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 16 Jul 2024 17:28:19 +0800 Subject: [PATCH 55/71] optimize no lora step --- vllm/lora/ops/bgmv_expand.py | 5 ++-- 
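LibEntry above wraps a Triton JITFunction, Autotuner, or Heuristics object and memoizes the compiled kernel per call signature, so repeat launches skip Triton's Python-side dispatch and re-tuning. The cache key is built from each tensor argument's dtype plus whether its pointer is 16-byte aligned, and from the value of each plain int argument. A sketch of just that key construction (the make_entry_key name and the example tensors are ours):

    import torch

    DIVISIBILITY = 16  # alignment bucket used by LibEntry

    def make_entry_key(*args) -> tuple:
        # Mirrors the key loop in LibEntry.run: tensors contribute
        # (dtype, aligned?), plain ints contribute their value, everything
        # else is ignored.
        key = []
        for arg in args:
            if hasattr(arg, "data_ptr"):
                key.append(arg.dtype)
                key.append(arg.data_ptr() % DIVISIBILITY == 0)
            elif isinstance(arg, int):
                key.append(arg)
        return tuple(key)

    x = torch.empty(8, 16, dtype=torch.float16)
    y = torch.empty(8, 16, dtype=torch.float16)
    print(make_entry_key(x, y, 128))
    # e.g. (torch.float16, True, torch.float16, True, 128); a later call with
    # the same dtypes, alignment and int args reuses the cached kernel.

In the ops files the wrapper is applied simply by stacking @libentry() directly above @triton.jit, as the hunks above show.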
vllm/lora/ops/bgmv_expand_slice.py | 8 +++---- vllm/lora/ops/bgmv_shrink.py | 5 ++-- vllm/lora/ops/sgmv_expand.py | 3 ++- vllm/lora/ops/sgmv_expand_slice.py | 5 ++-- vllm/lora/ops/sgmv_shrink.py | 3 ++- vllm/lora/punica.py | 38 ++++++++++++++++++++---------- 7 files changed, 42 insertions(+), 25 deletions(-) diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 576559beeffe..91251fa0510d 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -37,7 +37,7 @@ def _bgmv_expand_kernel( CAST_TYPE: tl.constexpr, ): """ - GroupGEMV,Additionally, introducing SPLIT_N can improve large hidden_size's + GroupGEMV, additionally, introducing SPLIT_N can improve large hidden_size's performance """ pid_sn = tl.program_id(axis=0) @@ -101,7 +101,8 @@ def bgmv_expand( lora_b_weights (torch.Tensor): lora'a weight output_tensor (torch.Tensor): output tensor lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch + corresponding to each batch, An index of -1 means no lora should be + applied. batches (int): batch size add_inputs (bool, optional): Defaults to False. adds the final lora results to the output. diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 24f2b93f4bf2..31b2cd545d3d 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -38,7 +38,7 @@ def _bgmv_expand_slice_kernel( CAST_TYPE: tl.constexpr, ): """ - GroupGEMV,Additionally, introducing SPLIT_N can improve large hidden_size's + GroupGEMV, additionally, introducing SPLIT_N can improve large hidden_size's performance """ pid_sn = tl.program_id(axis=0) @@ -105,7 +105,8 @@ def bgmv_expand_slice( lora_b_weights (torch.Tensor): lora'b weight output_tensor (torch.Tensor): output tensor lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch + corresponding to each batch, An index of -1 means no lora should be + applied. slice_offst (int): output_tensor's offst slice_size (int): current output_tensor's size batches (int): batch size @@ -136,10 +137,7 @@ def bgmv_expand_slice( # TODO tuning this config N, K = lora_b_weights.shape[-2:] # K= rank,N=hidden_size - # BLOCK_N = 256 BLOCK_K = triton.next_power_of_2(K) - - # SPLIT_N = 64 EVEN_K = K % BLOCK_K == 0 ADD_INPUTS = add_inputs CAST_TYPE = False diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index 6e3d90e2d235..1d8d23674d02 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -35,7 +35,7 @@ def _bgmv_shrink_kernel( SPLIT_K: tl.constexpr, ): """ - GroupGEMV,Additionally, introducing SPLIT-K can improve large hidden_size's + GroupGEMV, additionally, introducing SPLIT-K can improve large hidden_size's performance """ pid_sk = tl.program_id(axis=0) @@ -93,7 +93,8 @@ def bgmv_shrink( lora_a_weights (torch.Tensor): lora'a weight output_tensor (torch.Tensor): output tensor lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch + corresponding to each batch. An index of -1 means no lora should be + applied. batches (int): batch size scaling (float): Scaling factor. override_config (Optional[Dict[str, int]], optional): Defaults to None. diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 27e91f5d1e4e..7e82533e4a1f 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -121,7 +121,8 @@ def sgmv_expand( seq_len_tensor (torch.Tensor): (batch_size,). 
record the sequence length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch + corresponding to each batch. An index of -1 means no lora should be + applied. batches (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 2906500e7873..261e562683d3 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -42,7 +42,7 @@ def _sgmv_expand_slice_kernel( Similar to the 'sgmv_expand' operator, but with an added parameter 'slice_offset'. The reason for not reusing the 'sgmv_expand' operator - might be that in the future, we could implement a fusion operator to + might be that in the future, we could implement a fusion operator to achieve the current functionality instead of having to call it multiple times. """ @@ -130,7 +130,8 @@ def sgmv_expand_slice( seq_len_tensor (torch.Tensor): (batch_size,). record the sequence length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch + corresponding to each batch. An index of -1 means no lora should be + applied. batches (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index c5bc1c08364c..670117cee000 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -125,7 +125,8 @@ def sgmv_shrink( seq_len_tensor (torch.Tensor): (batch_size,). record the sequence length of the sequences in the batch lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index - corresponding to each batch + corresponding to each batch. An index of -1 means no lora should be + applied. batches (int): batch size max_seq_length (int): The max sequence lengths of the sequences in the batch diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 16d41cfa11ff..19b523002384 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -24,7 +24,7 @@ def compute_meta( token_lora_tensor: torch.Tensor -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int]: +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, bool]: """ Get the information required for the sgmv kernel. With the features: 1. If consecutive requests in the batch use the same LoRA, this function @@ -40,14 +40,16 @@ def compute_meta( b_seq_start_tensor = torch.zeros_like(seq_length_tensor) b_seq_start_tensor[1:].copy_(cum_result[:-1]) max_length = seq_length_tensor.max().item() + batch_size = lora_indices_tensor.size(0) - return ( - b_seq_start_tensor, - seq_length_tensor, - lora_indices_tensor, - batch_size, - max_length, - ) + no_lora = False + # -1 means no lora should be applied. Use `no_lora` to determine whether + # the current step requires LoRA. If LoRA is not needed, the prefill stage + # does not need to launch the triton kernel, which can improve performance + if batch_size == 1 and lora_indices_tensor == -1: + no_lora = True + return (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, + batch_size, max_length, no_lora) # TODO see if this can be vectorized @@ -174,7 +176,7 @@ class PunicaWrapper: """ PunicaWrapper is designed to manage and provide metadata for the punica kernel. The main function is to maintain the state information for - Multi-LoRA, and to provide the interface for the punica operator. 
+ Multi-LoRA, and to provide the interface for the punica kernel. """ def __init__(self, max_num_batched_tokens: int, max_batches: int, @@ -213,6 +215,7 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, self.max_length: int = 0 self.batch_size: int = -1 self.is_prefill = False + self.no_lora = False def update_metadata( self, @@ -276,7 +279,7 @@ def _update_base_metadata( def _update_prefill_metada(self, token_lora_tensor: torch.Tensor) -> None: (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, - batch_size, max_length) = compute_meta(token_lora_tensor) + batch_size, max_length, no_lora) = compute_meta(token_lora_tensor) self._seq_start_locs[:b_seq_start_tensor.shape[0]].copy_( b_seq_start_tensor) @@ -285,6 +288,7 @@ def _update_prefill_metada(self, token_lora_tensor: torch.Tensor) -> None: lora_indices_tensor) self.batch_size = batch_size self.max_length = max_length + self.no_lora = no_lora @property def prefill_metadata( @@ -294,7 +298,8 @@ def prefill_metadata( metadata for prefill-related kernel computations. 1. seq_start_locs: Tensor of sequence start positions 2. seq_lengths: Tensor of sequence lengths - 3. lora_indices_per_batch: Tensor of lora indices + 3. lora_indices_per_batch: Tensor of lora indices, and an index of + -1 means no lora should be applied. 4. batch_size: batch size after clustering identical lora indices 5. max_length: The maximum sequence length in the batch """ @@ -307,7 +312,7 @@ def prefill_metadata( def token_lora_indices(self) -> torch.Tensor: """ This property provides the lora indices corresponding to each token - in the batch + in the batch. An index of -1 means no lora should be applied. """ token_lora_len = self.indices_len[0] return self._token_lora_indices[:token_lora_len] @@ -354,6 +359,9 @@ def shrink_prefill( w_t_all: torch.Tensor, scale: float, ): + #No LoRA request, so return directly + if self.no_lora: + return sgmv_shrink( x, w_t_all, @@ -378,6 +386,9 @@ def expand_prefill( w_t_all: torch.Tensor, add_input: bool, ): + #No LoRA request, so return directly + if self.no_lora: + return sgmv_expand( x, w_t_all, @@ -404,6 +415,9 @@ def expand_slice_prefill( y_slice_size: Optional[int], add_input: bool, ): + #No LoRA request, so return directly + if self.no_lora: + return sgmv_expand_slice( x, w_t_all, From e7b4a4e3a60cb957089209ab83509d1a08de64cc Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 17 Jul 2024 11:20:03 +0800 Subject: [PATCH 56/71] move libentry location --- vllm/lora/ops/sgmv_expand.py | 2 +- vllm/lora/ops/sgmv_expand_slice.py | 2 +- vllm/lora/ops/sgmv_shrink.py | 2 +- vllm/lora/punica.py | 2 +- vllm/triton_utils/__init__.py | 5 ++--- vllm/{lora/ops => triton_utils}/libentry.py | 0 6 files changed, 6 insertions(+), 7 deletions(-) rename vllm/{lora/ops => triton_utils}/libentry.py (100%) diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 7e82533e4a1f..f4edde95345e 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -9,7 +9,7 @@ import triton import triton.language as tl -from .libentry import libentry +from vllm.triton_utils import libentry @libentry() diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 261e562683d3..16181f3f7b74 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -9,7 +9,7 @@ import triton import triton.language as tl -from .libentry import libentry +from vllm.triton_utils import libentry @libentry() diff --git a/vllm/lora/ops/sgmv_shrink.py 
b/vllm/lora/ops/sgmv_shrink.py index 670117cee000..8ab049989abe 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -9,7 +9,7 @@ import triton import triton.language as tl -from .libentry import libentry +from vllm.triton_utils import libentry @libentry() diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 19b523002384..ac1392518590 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -17,7 +17,7 @@ from vllm.lora.ops.sgmv_shrink import sgmv_shrink if TYPE_CHECKING: - # avodi circuit import + # avoid circuit import from vllm.lora.layers import LoRAMapping from vllm.lora.models import LongContextLoRAContext diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py index 09843e5d1f30..e42f41a6779f 100644 --- a/vllm/triton_utils/__init__.py +++ b/vllm/triton_utils/__init__.py @@ -1,6 +1,5 @@ from vllm.triton_utils.custom_cache_manager import ( maybe_set_triton_cache_manager) +from vllm.triton_utils.libentry import libentry -__all__ = [ - "maybe_set_triton_cache_manager", -] +__all__ = ["maybe_set_triton_cache_manager", "libentry"] diff --git a/vllm/lora/ops/libentry.py b/vllm/triton_utils/libentry.py similarity index 100% rename from vllm/lora/ops/libentry.py rename to vllm/triton_utils/libentry.py From 008a9d7e1d656ccab9a1b31f0b265f3215532343 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Wed, 17 Jul 2024 15:47:44 +0800 Subject: [PATCH 57/71] test gemma lora --- tests/lora/test_gemma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 709246179bfe..478bb86b7861 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -37,7 +37,7 @@ def test_gemma_lora(gemma_lora_files): expected_lora_output = [ "more important than knowledge.\nAuthor: Albert Einstein\n", "everyone else is already taken.\nAuthor: Oscar Wilde\n", - "so little time\nAuthor: Frank Zappa\n", + "so little time.\nAuthor: Frank Zappa\n", ] output1 = do_sample(llm, gemma_lora_files, lora_id=1) From 5e112090ee972bff21e31ae711e366763a5c7f18 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Thu, 18 Jul 2024 14:11:08 +0800 Subject: [PATCH 58/71] cleanup code --- vllm/_custom_ops.py | 42 +------------------------------ vllm/lora/fully_sharded_layers.py | 2 -- vllm/lora/models.py | 1 + vllm/triton_utils/libentry.py | 9 ++++++- 4 files changed, 10 insertions(+), 44 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 4ca67224a91b..0130d3424c13 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -13,12 +13,9 @@ except ImportError as e: logger.warning("Failed to import from vllm._C with %r", e) -with contextlib.suppress(ImportError): - import vllm._moe_C - with contextlib.suppress(ImportError): # ruff: noqa: F401 - import vllm._punica_C + import vllm._moe_C def is_custom_op_supported(op_name: str) -> bool: @@ -471,43 +468,6 @@ def register_graph_buffers(fa: int, handles: List[str], torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets) -# punica -def dispatch_bgmv( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - indicies: torch.Tensor, - layer_idx: int, - scale: float, -) -> None: - torch.ops._punica_C.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, - scale) - - -def dispatch_bgmv_low_level( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - indicies: torch.Tensor, - layer_idx: int, - scale: float, - h_in: int, - h_out: int, - y_offset: int, -) -> None: - torch.ops._punica_C.dispatch_bgmv_low_level( - y, - x, - w_t_all, - 
indicies, - layer_idx, - scale, - h_in, - h_out, - y_offset, - ) - - # temporary fix for https://github.com/vllm-project/vllm/issues/5456 # TODO: remove this in v0.6.0 names_and_values = globals() diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index cae7d593f123..f751434bb7b4 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -15,8 +15,6 @@ QKVParallelLinearWithLora, RowParallelLinearWithLoRA) -# from vllm.lora.punica import add_expand, add_expand_slice, add_shrink - if TYPE_CHECKING: pass diff --git a/vllm/lora/models.py b/vllm/lora/models.py index bbb2fca6e804..9a9b4766cf41 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -461,6 +461,7 @@ def _create_lora_modules(self): self.model.config)) self.register_module(module_name, new_module) self._register_packed_modules(module_name) + # All lora layers share the same punica_wrapper based on reference. new_module.set_mapping(self.punica_wrapper) def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py index 09790dc38c3c..2a981663bebf 100644 --- a/vllm/triton_utils/libentry.py +++ b/vllm/triton_utils/libentry.py @@ -80,7 +80,10 @@ def run(self, *args, **kwargs): current = dict(**kwargs, **config) meta = {**dict(zip(self.arg_names, args)), **current} grid = grid(meta) - grid = grid + (1, 1) + if isinstance(grid, list): + grid = grid + [1, 1] + elif isinstance(grid, list): + grid = grid + (1, 1) kernel[grid[0:3]](*args) return @@ -89,6 +92,10 @@ def run(self, *args, **kwargs): def libentry(): """ Decorator for triton library entries. + Motivation: + The runtime overhead of Triton kernels is the reason for the lower + performance of small kernels, particularly evident with smaller models. + Using this decorator can reduce Triton runtime overhead. 
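A minimal usage sketch of the decorator (illustration only, not part of this series; the toy kernel, tensor names, and block size below are made up). The wrapped kernel is launched exactly like a plain @triton.jit kernel; the first launch goes through JITFunction.run, which compiles, executes, and fills LibEntry's cache, and later launches with the same argument types reuse the cached compiled kernel:

    import torch
    import triton
    import triton.language as tl

    from vllm.triton_utils import libentry


    @libentry()
    @triton.jit
    def _scale_kernel(x_ptr, y_ptr, n, scale, BLOCK: tl.constexpr):
        # Scale a 1-D tensor block by block.
        offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
        mask = offs < n
        x = tl.load(x_ptr + offs, mask=mask, other=0.0)
        tl.store(y_ptr + offs, x * scale, mask=mask)


    x = torch.randn(4096, device="cuda", dtype=torch.float16)
    y = torch.empty_like(x)
    grid = (triton.cdiv(x.numel(), 256), )
    _scale_kernel[grid](x, y, x.numel(), 2.0, 256)  # compile + run, cache entry created
    _scale_kernel[grid](x, y, x.numel(), 2.0, 256)  # cached fast path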
""" def decorator(fn): From 0c010fdae976fb4dc82fb65fb59d5a4f2cf3135e Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 19 Jul 2024 19:25:15 +0800 Subject: [PATCH 59/71] Verify libentry decorator for punica and sample kernels --- tests/kernels/test_sampler.py | 55 +++++++++++++-------- tests/lora/test_triton_punica.py | 83 +++++++++++++++++++++++--------- 2 files changed, 96 insertions(+), 42 deletions(-) diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py index e28f809309ec..34104c1818c7 100644 --- a/tests/kernels/test_sampler.py +++ b/tests/kernels/test_sampler.py @@ -1,4 +1,5 @@ import gc +from unittest.mock import patch import pytest import torch @@ -6,10 +7,11 @@ import triton.language as tl from vllm.model_executor.layers.ops.sample import ( - MAX_TRITON_N_COLS, _uniform_to_exponential, get_num_triton_sampler_splits, - sample) + MAX_TRITON_N_COLS, _sample_triton, _uniform_to_exponential, + get_num_triton_sampler_splits, sample) from vllm.model_executor.sampling_metadata import SamplingTensors from vllm.model_executor.utils import set_random_seed +from vllm.triton_utils.libentry import LibEntry SINGLE_SPLIT_VOCAB_SIZE = 32000 # llama/mistral/mixtral vocab size MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100 @@ -75,15 +77,20 @@ def test_sample_decoding_only(random_sampling, max_best_of, seeds = torch.randint(1, torch.iinfo(torch.long).max, (n_splits, bs), device="cuda").mul_(random_sampling_mask) - sampled_tokens, sampled_logprobs, sampled_modified_probs = sample( - probs=probs, - logprobs=logprobs, - sample_indices=sample_indices, - seeds=seeds, - max_best_of=max_best_of, - modify_greedy_probs=modify_greedy_probs, - save_logprobs=save_logprobs, - _save_modified_probs=True) + #The current _sample_triton does not utilize the + # libentry decoration. The purpose of adding this patch is to test + # the correctness of libentry. 
+ with patch("vllm.model_executor.layers.ops.sample._sample_triton", + LibEntry(_sample_triton)): + sampled_tokens, sampled_logprobs, sampled_modified_probs = sample( + probs=probs, + logprobs=logprobs, + sample_indices=sample_indices, + seeds=seeds, + max_best_of=max_best_of, + modify_greedy_probs=modify_greedy_probs, + save_logprobs=save_logprobs, + _save_modified_probs=True) assert sampled_tokens.shape == (bs, max_best_of) for i in range(bs): assert torch.all(sampled_tokens[i] == i * (vocab_size // bs)) @@ -129,6 +136,7 @@ def test_sample_decoding_only(random_sampling, max_best_of, [SINGLE_SPLIT_VOCAB_SIZE, MULTI_SPLIT_VOCAB_SIZE]) def test_sample_prompt_logprobs(random_sampling, max_best_of, modify_greedy_probs, seed, vocab_size): + set_random_seed(seed) prompt_sizes = [16, 32, 64, 128] * 2 samples = 8 @@ -156,14 +164,17 @@ def test_sample_prompt_logprobs(random_sampling, max_best_of, seeds = torch.randint(1, torch.iinfo(torch.long).max, (n_splits, samples), device="cuda").mul_(random_sampling_mask) - sampled_tokens, sampled_logprobs, _ = sample( - probs=probs, - logprobs=logprobs, - sample_indices=sample_indices, - seeds=seeds, - max_best_of=max_best_of, - modify_greedy_probs=modify_greedy_probs, - save_logprobs=True) + #ditto + with patch("vllm.model_executor.layers.ops.sample._sample_triton", + LibEntry(_sample_triton)): + sampled_tokens, sampled_logprobs, _ = sample( + probs=probs, + logprobs=logprobs, + sample_indices=sample_indices, + seeds=seeds, + max_best_of=max_best_of, + modify_greedy_probs=modify_greedy_probs, + save_logprobs=True) assert sampled_tokens.shape == (samples, max_best_of) assert sampled_logprobs.shape == (samples, max_best_of) for i, t in enumerate(sample_indices): @@ -194,3 +205,9 @@ def test_get_sequence_seeds(seed): assert new_seq_seed_extra_entropy != new_seq_seed assert seq_seed != new_seq_seed seq_seed = new_seq_seed + + +if __name__ == "__main__": + pytest.main([ + "/home/sobey/Code/Code_leejee/vllm_main/vllm/tests/kernels/test_sampler.py::test_sample_prompt_logprobs" + ]) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index 3ed2f032241e..de935818c1c5 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -1,4 +1,5 @@ import random +from unittest.mock import patch import pytest import torch @@ -9,6 +10,7 @@ from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.triton_utils.libentry import LibEntry HIDDEN_SIZES = [ 128, @@ -323,6 +325,8 @@ def test_punica_bgmv( seed: int, device: str, ): + from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel + from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel random.seed(seed) torch.set_default_device(device) torch.random.manual_seed(seed) @@ -346,21 +350,29 @@ def test_punica_bgmv( ) = _generate_data(batches, hidden_size, num_loras, rank, seq_length, dtype, op_type, device) if op_type == "shrink": - bgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - scaling, - ) + #The current _bgmv_shrink_kernel does not require the libentry + # decoration. The purpose of adding this patch is to test the + # correctness of libentry. 
+ with patch("vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", + LibEntry(_bgmv_shrink_kernel)): + bgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + scaling, + ) else: - bgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - add_inputs=True, - ) + #ditto + with patch("vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", + LibEntry(_bgmv_expand_kernel)): + bgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + add_inputs=True, + ) _torch_groupgemm( ref_out_tensor, inputs_tensor, @@ -394,6 +406,7 @@ def test_punica_expand_nslices( seed: int, device: str, ): + from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel random.seed(seed) torch.set_default_device(device) torch.random.manual_seed(seed) @@ -446,15 +459,21 @@ def test_punica_expand_nslices( add_inputs=True, ) else: - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) + #The current _bgmv_expand_slice_kernel does not require the + # libentry decoration. The purpose of adding this patch is to test + # the correctness of libentry. + with patch( + "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel", + LibEntry(_bgmv_expand_slice_kernel)): + bgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) _torch_groupgemm( ref_outputs[:, slice_offset:slice_offset + hidden_size], inputs_tensor, @@ -468,3 +487,21 @@ def test_punica_expand_nslices( slice_offset += hidden_size assert_close(our_outputs, ref_outputs) + + +if __name__ == "__main__": + from itertools import product + lst = list( + product( + BATCHES, + NUM_LORA, + MAX_RANKS, + [1.0], + [torch.float16], + ["shrink"], + SEED, + CUDA_DEVICES, + )) + for ele in lst: + test_punica_bgmv(*ele) + print(f"{ele},pass") From 1a23abc214cb9b298e70681d24fc9009b838d5ff Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 22 Jul 2024 14:06:48 +0800 Subject: [PATCH 60/71] clean up code --- tests/kernels/test_sampler.py | 6 ------ vllm/triton_utils/libentry.py | 3 +-- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py index 34104c1818c7..8d6a622a5071 100644 --- a/tests/kernels/test_sampler.py +++ b/tests/kernels/test_sampler.py @@ -205,9 +205,3 @@ def test_get_sequence_seeds(seed): assert new_seq_seed_extra_entropy != new_seq_seed assert seq_seed != new_seq_seed seq_seed = new_seq_seed - - -if __name__ == "__main__": - pytest.main([ - "/home/sobey/Code/Code_leejee/vllm_main/vllm/tests/kernels/test_sampler.py::test_sample_prompt_logprobs" - ]) diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py index d2d1ce969237..4bdbc4efd979 100644 --- a/vllm/triton_utils/libentry.py +++ b/vllm/triton_utils/libentry.py @@ -44,7 +44,6 @@ def key(self, spec_args, dns_args, const_args): def run(self, *args, **kwargs): grid = kwargs["grid"] - # collect all the arguments spec_args = [] # specialize arguments dns_args = [] # do not specialize arguments @@ -118,7 +117,7 @@ def run(self, *args, **kwargs): # captured args have higher priority filterd_constexprs = { k: v - for k, v in constexprs.items() if not isinstance(v, type) + for k, v in constexprs.items() if v is not inspect._empty } meta = { **dict(zip(self.arg_names, args)), From c1a0cd50ece137314f5ea5e8968331c29cd856cd Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 22 Jul 2024 17:05:13 +0800 Subject: [PATCH 61/71] 
modify libentry code --- vllm/triton_utils/libentry.py | 57 +++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py index 4bdbc4efd979..d247d0a5adad 100644 --- a/vllm/triton_utils/libentry.py +++ b/vllm/triton_utils/libentry.py @@ -78,7 +78,7 @@ def run(self, *args, **kwargs): entry_key = self.key(spec_args, dns_args, const_args) if entry_key not in self.kernel_cache: - # compile kernel + # compiling the kernel also completes the related computations kernel = self.fn.run(*args, **kwargs) fn = self.fn # collect constexpr arguments for grid computation @@ -100,38 +100,43 @@ def run(self, *args, **kwargs): else: raise RuntimeError("Invalid Runtime Function") fn = fn.fn + # In vLLM, certain kernels like fused_moe_kernel get the + # best_config(as kwargs) from a configuration json file, rather + # than using Autotuner & Heuristics. Therefore, all their constexprs + # (tl.constexpr) are assigned values through the following loop. for p in self.jit_function.params: if p.is_constexpr and p.name not in constexprs: - constexprs[p.name] = p.default + constexprs[p.name] = p.default #default=inspect._empty self.kernel_cache[entry_key] = (kernel, constexprs) - return else: kernel, constexprs = self.kernel_cache[entry_key] - if callable(grid): - # collect all arguments to the grid fn,ie: - # 1. args, - # 2. kwargs, - # 3. all all other captured arguments in CompiledKernel from - # Autotunner & Heuristics when kwargs & captured args conflict, - # captured args have higher priority - filterd_constexprs = { - k: v - for k, v in constexprs.items() if v is not inspect._empty - } - meta = { - **dict(zip(self.arg_names, args)), - **kwargs, - **filterd_constexprs, - } + if callable(grid): + # collect all arguments to the grid fn,ie: + # 1. args, + # 2. kwargs, + # 3. all all other captured arguments in CompiledKernel from + # Autotunner & Heuristics when kwargs & captured args conflict, + # captured args have higher priority + # 4. 
We must filter out captured args with default value firstly + constexprs = { + k: v + for k, v in constexprs.items() if v is not inspect._empty + } + + meta = { + **dict(zip(self.arg_names, args)), + **kwargs, + **constexprs, + } grid = grid(meta) - if isinstance(grid, tuple): - grid = grid + (1, 1) - elif isinstance(grid, list): - grid = grid + [1, 1] - - kernel[grid[0:3]](*k_args) - return + if isinstance(grid, tuple): + grid = grid + (1, 1) + elif isinstance(grid, list): + grid = grid + [1, 1] + kernel[grid[0:3]](*k_args) + # maintaining the same return type as the JITFunction.run + return kernel def libentry(): From 4513dcf4b06b55911b3c58fc9c2b439950e1b872 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 22 Jul 2024 17:29:15 +0800 Subject: [PATCH 62/71] fix bug --- vllm/triton_utils/libentry.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py index d247d0a5adad..85786fae334a 100644 --- a/vllm/triton_utils/libentry.py +++ b/vllm/triton_utils/libentry.py @@ -123,13 +123,12 @@ def run(self, *args, **kwargs): k: v for k, v in constexprs.items() if v is not inspect._empty } - meta = { **dict(zip(self.arg_names, args)), **kwargs, **constexprs, } - grid = grid(meta) + grid = grid(meta) if isinstance(grid, tuple): grid = grid + (1, 1) elif isinstance(grid, list): From c876e39e2cdd1a23209bb8ab188cdc2b02fe52bc Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Mon, 22 Jul 2024 23:31:19 +0800 Subject: [PATCH 63/71] modify libentry code and cleanup code --- tests/lora/test_quant_model.py | 48 +++++++++++++++++------------- vllm/lora/ops/bgmv_expand.py | 1 - vllm/lora/ops/bgmv_shrink.py | 1 - vllm/lora/ops/sgmv_expand.py | 1 - vllm/lora/ops/sgmv_expand_slice.py | 1 - vllm/triton_utils/libentry.py | 12 +++++++- 6 files changed, 38 insertions(+), 26 deletions(-) diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 8fd968c69e58..2c78fbae397c 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -64,14 +64,16 @@ def test_quant_model_lora(tinyllama_lora_files, model, tp_size): # if torch.cuda.device_count() < tp_size: # pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") - llm = vllm.LLM(model=model.model_path, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - max_model_len=400, - tensor_parallel_size=tp_size, - quantization=model.quantization, - trust_remote_code=True) + llm = vllm.LLM( + model=model.model_path, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + max_model_len=400, + tensor_parallel_size=tp_size, + gpu_memory_utilization=0.4, #avoid OOM + quantization=model.quantization, + trust_remote_code=True) if model.quantization is None: expected_no_lora_output = [ @@ -156,24 +158,28 @@ def test_quant_model_tp_equality(tinyllama_lora_files, model): # if torch.cuda.device_count() < 2: # pytest.skip(f"Not enough GPUs for tensor parallelism {2}") - llm_tp1 = vllm.LLM(model=model.model_path, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=1, - quantization=model.quantization, - trust_remote_code=True) + llm_tp1 = vllm.LLM( + model=model.model_path, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=1, + gpu_memory_utilization=0.4, #avoid OOM + quantization=model.quantization, + trust_remote_code=True) output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1) del llm_tp1 cleanup() - llm_tp2 = vllm.LLM(model=model.model_path, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - 
tensor_parallel_size=2, - quantization=model.quantization) + llm_tp2 = vllm.LLM( + model=model.model_path, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=2, + gpu_memory_utilization=0.4, #avoid OOM + quantization=model.quantization) output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1) del llm_tp2 diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 91251fa0510d..2d09c7cfe6c8 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -66,7 +66,6 @@ def _bgmv_expand_kernel( c_ptr = out_ptr + cur_batch * cm_stride + pid_sn * split_n_length for n in range(0, split_n_length, BLOCK_N): current_n = n + offset_n - # vector load current_n_c = tl.max_contiguous(current_n, BLOCK_N) b_ptr_mask = (current_n[:, None] < split_n_length) & (offset_k[None, :] < K) diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index 1d8d23674d02..e69d33078f5a 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -51,7 +51,6 @@ def _bgmv_shrink_kernel( accumulator = tl.zeros((BLOCK_N, ), dtype=tl.float32) for k in range(0, K, BLOCK_K * SPLIT_K): current_k = k + offset_k - # vector load current_k_c = tl.max_contiguous(current_k, BLOCK_K) tiled_a = tl.load( a_ptr + current_k_c, diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index f4edde95345e..459049546909 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -76,7 +76,6 @@ def _sgmv_expand_kernel( other=0) if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) - #TODO Can I use D=A@B+C ? accumulator += tl.dot( tiled_a, tiled_b, diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 16181f3f7b74..ff3bcda071b8 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -82,7 +82,6 @@ def _sgmv_expand_slice_kernel( other=0) if CAST_TYPE: tiled_a = tiled_a.to(lora_ptr.dtype.element_ty) - # TODO Can I use D=A@B+C ? accumulator += tl.dot( tiled_a, tiled_b, diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py index 85786fae334a..9e3774d98a54 100644 --- a/vllm/triton_utils/libentry.py +++ b/vllm/triton_utils/libentry.py @@ -78,7 +78,7 @@ def run(self, *args, **kwargs): entry_key = self.key(spec_args, dns_args, const_args) if entry_key not in self.kernel_cache: - # compiling the kernel also completes the related computations + # compile the kernel also completes the related computations kernel = self.fn.run(*args, **kwargs) fn = self.fn # collect constexpr arguments for grid computation @@ -109,6 +109,7 @@ def run(self, *args, **kwargs): constexprs[p.name] = p.default #default=inspect._empty self.kernel_cache[entry_key] = (kernel, constexprs) else: + # load kernel from cache directly kernel, constexprs = self.kernel_cache[entry_key] if callable(grid): @@ -145,6 +146,15 @@ def libentry(): The runtime overhead of Triton kernels is the reason for the lower performance of small kernels, particularly evident with smaller models. Using this decorator can reduce Triton runtime overhead. + How: + The `run` function of JITFunction needs to accomplish: + - Parameter binding using inspect + - KernelArg type wrapping + - Cache key calculation + When dealing with small size, these steps can become bottlenecks in + Triton runtime. Libentry simplifies these steps to reduce runtime + overhead, thereby improving the runtime expenses of small kernels. 
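For reference, a rough sketch of the kind of per-call key that replaces the work listed above (modeled on the first revision of LibEntry in this series, which keyed on tensor dtype, pointer alignment, and integer values; the helper name is illustrative):

    def _entry_key(args, divisibility=16):
        # Cheap key: just enough to find a previously compiled kernel again,
        # with no inspect-based binding and no KernelArg wrapping.
        key = []
        for arg in args:
            if hasattr(arg, "data_ptr"):  # torch.Tensor
                key.append(arg.dtype)
                key.append(arg.data_ptr() % divisibility == 0)
            elif isinstance(arg, int):
                key.append(arg)
        return tuple(key)

Only properties that can change the compiled binary (dtype, pointer alignment, integer specialization) are folded into the key, so computing it stays cheap even for kernels with many arguments.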
+ """ def decorator(fn): From b02bce3aae76d50b60f43909e55061db27a857c5 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 23 Jul 2024 07:45:29 +0800 Subject: [PATCH 64/71] add a comment to libentry code --- vllm/triton_utils/libentry.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py index 9e3774d98a54..2654faa526db 100644 --- a/vllm/triton_utils/libentry.py +++ b/vllm/triton_utils/libentry.py @@ -154,6 +154,10 @@ def libentry(): When dealing with small size, these steps can become bottlenecks in Triton runtime. Libentry simplifies these steps to reduce runtime overhead, thereby improving the runtime expenses of small kernels. + NOTE: + When Triton is upgraded to version 3.0.0, libentry can be removed, + see: https://github.com/vllm-project/vllm/pull/5036#issuecomment-2243396245 + """ From 89e96eb7676c4556be58abb461462e7f70a2afc5 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Tue, 23 Jul 2024 14:30:43 +0800 Subject: [PATCH 65/71] test lora CI --- tests/lora/test_quant_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 2c78fbae397c..2370c693e953 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -71,7 +71,7 @@ def test_quant_model_lora(tinyllama_lora_files, model, tp_size): max_loras=4, max_model_len=400, tensor_parallel_size=tp_size, - gpu_memory_utilization=0.4, #avoid OOM + gpu_memory_utilization=0.2, #avoid OOM quantization=model.quantization, trust_remote_code=True) @@ -164,7 +164,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, model): max_num_seqs=16, max_loras=4, tensor_parallel_size=1, - gpu_memory_utilization=0.4, #avoid OOM + gpu_memory_utilization=0.2, #avoid OOM quantization=model.quantization, trust_remote_code=True) output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1) @@ -178,7 +178,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, model): max_num_seqs=16, max_loras=4, tensor_parallel_size=2, - gpu_memory_utilization=0.4, #avoid OOM + gpu_memory_utilization=0.2, #avoid OOM quantization=model.quantization) output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1) From 1f4a4721569ffd7f6141e7845bbbe5876c148348 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 26 Jul 2024 00:47:26 +0800 Subject: [PATCH 66/71] fix typo --- vllm/triton_utils/libentry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py index 2654faa526db..ae00af44a048 100644 --- a/vllm/triton_utils/libentry.py +++ b/vllm/triton_utils/libentry.py @@ -135,7 +135,7 @@ def run(self, *args, **kwargs): elif isinstance(grid, list): grid = grid + [1, 1] kernel[grid[0:3]](*k_args) - # maintaining the same return type as the JITFunction.run + # maintaining the same return type as the JITFunction.run return kernel From 377847ad288a501daa01104a3cef79f54e478d9e Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 26 Jul 2024 00:49:59 +0800 Subject: [PATCH 67/71] modify test --- tests/lora/test_triton_punica.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_triton_punica.py index de935818c1c5..80b5ec017286 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_triton_punica.py @@ -65,7 +65,7 @@ ] BATCHES = [1, 2, 4] + [8 * i for i in range(1, 7)] -NUM_LORA = [1, 4, 8, 16, 32, 64, 128] +NUM_LORA = [1, 4, 8, 16] DTYPES = 
[torch.float16, torch.bfloat16] MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] SCALES = [0.5] From cd1fb05c6fd87e888b7eb6a99c40cd793fb384c7 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 26 Jul 2024 00:59:15 +0800 Subject: [PATCH 68/71] Trigger CI From 9ac909e9bbe1bbc7a0b00344866575766ff00942 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Fri, 26 Jul 2024 23:28:15 +0800 Subject: [PATCH 69/71] optimize bgmv_exapnd and enhance punica unit test --- ..._triton_punica.py => test_punica_sizes.py} | 283 +++++---------- tests/lora/test_punica_variation.py | 342 ++++++++++++++++++ tests/lora/utils.py | 148 ++++++++ vllm/lora/ops/bgmv_expand.py | 2 +- vllm/lora/ops/bgmv_expand_slice.py | 2 +- vllm/lora/ops/utils.py | 18 +- 6 files changed, 601 insertions(+), 194 deletions(-) rename tests/lora/{test_triton_punica.py => test_punica_sizes.py} (53%) create mode 100644 tests/lora/test_punica_variation.py diff --git a/tests/lora/test_triton_punica.py b/tests/lora/test_punica_sizes.py similarity index 53% rename from tests/lora/test_triton_punica.py rename to tests/lora/test_punica_sizes.py index 80b5ec017286..c052568dc2e3 100644 --- a/tests/lora/test_triton_punica.py +++ b/tests/lora/test_punica_sizes.py @@ -1,3 +1,9 @@ +""" +This script is mainly used to tests various hidden_sizes. We have collected the +hidden_sizes included in the LoRA models currently supported by vLLM. It tests +whether the corresponding Triton kernel can run normally when tensor parallelism +is set to [1, 2, 4, 8, 16, 32, 64]. +""" import random from unittest.mock import patch @@ -12,42 +18,67 @@ from vllm.lora.ops.sgmv_shrink import sgmv_shrink from vllm.triton_utils.libentry import LibEntry +from .utils import (generate_data, generate_data_for_expand_nslices, + ref_torch_groupgemm) + HIDDEN_SIZES = [ 128, 256, 512, + 896, 1024, 1152, + 1216, 1280, 1536, + 1664, 2048, + 2240, 2304, + 2368, + 2432, 2560, 2752, 3072, - 3424, + 3328, 3456, 3584, + 3712, 4096, + 4480, 4608, + 4736, + 4864, 5120, 5504, 5632, + 5888, 6144, + 6400, 6848, 6912, 7168, + 7424, 8192, + 8960, 9216, + 9472, 10240, 11008, + 11264, 13824, 14336, + 14784, + 14848, 15360, + 18944, 22016, + 22528, 24576, 27392, 27648, + 29568, + 29696, 32000, 32256, 32512, @@ -56,6 +87,9 @@ 36864, 43264, 49152, + 49408, + 60544, + 60672, 64000, 64256, 102400, @@ -63,11 +97,20 @@ 128000, 128256, ] +#The size of TP +divisibility = [1, 2, 4, 8, 16, 32, 64] + +all_hidden_size = [] +for div in divisibility: + for hidden_size in HIDDEN_SIZES: + all_hidden_size.append(hidden_size // div) -BATCHES = [1, 2, 4] + [8 * i for i in range(1, 7)] -NUM_LORA = [1, 4, 8, 16] +HIDDEN_SIZES = list(set(all_hidden_size)) + +BATCHES = [4] +NUM_LORA = [4] DTYPES = [torch.float16, torch.bfloat16] -MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] +MAX_RANKS = [32] SCALES = [0.5] SEED = [0] CUDA_DEVICES = [f"cuda:{0}"] @@ -82,150 +125,10 @@ def assert_close(a, b): torch.testing.assert_close(a, b, rtol=rtol, atol=atol) -def _torch_groupgemm( - out_tensor, - inputs, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - scaling, - op_type, -) -> torch.Tensor: - out_list = [] - current_offset = 0 - for lora_index, b_length in zip(range(batches), seq_len_tensor): - input_weight = inputs[current_offset:b_length + current_offset, :] - current_offset += b_length - lora_weight = lora_weights[lora_indices_tensor[lora_index]] - result = torch.nn.functional.linear(input_weight, lora_weight) - result *= scaling - out_list.append(result) - cat_result = torch.cat(out_list, dim=0) - if op_type == "expand": - out_tensor += 
cat_result - else: - out_tensor.copy_(cat_result) - return - - -def _generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, - dtype, op_type, device): - seq_len_tensor = torch.randint(seq_length, seq_length + 1, - (batches, )).to(device) - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), - dim=0, - ).to(device) - total_tokens = seq_len_tensor.sum() - if op_type == "shrink": - inputs_tensor = torch.rand((total_tokens, hidden_size), - dtype=dtype).to(device) - lora_weights = torch.rand( - (lora_nums, max_rank, hidden_size), # col-major - dtype=dtype, - ).to(device) - # shrink op need atomic_add, so output is initinized by 0 - ref_out_tensor = torch.zeros((total_tokens, max_rank), - dtype=dtype, - device=inputs_tensor.device) - # NOTE shrink kernel using torch.float32 as output type - our_out_tensor = torch.zeros((total_tokens, max_rank), - dtype=torch.float32).to(device) - else: - inputs_tensor = torch.rand( - (total_tokens, max_rank), - dtype=dtype, - ).to(device) - lora_weights = torch.rand( - (lora_nums, hidden_size, max_rank), # col-major - dtype=dtype, - ).to(device) - # expand op needs to complete y+=a@lora_b, so output is - # initinized randomly - ref_out_tensor = torch.rand( - (total_tokens, hidden_size), - dtype=dtype, - ).to(device) - # Ensure the same input. - our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint(0, - lora_nums - 1 if lora_nums > 1 else 1, - (batches, )).to(device) - indices = torch.zeros((total_tokens), dtype=torch.long).to(device) - current_offset = 0 - for b_id in range(batches): - lora_index = lora_indices_tensor[b_id] - indices[current_offset:current_offset + - seq_len_tensor[b_id]].copy_(lora_index) - current_offset += seq_len_tensor[b_id].item() - return ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) - - -def _generate_data_expand_nslices(batches, hidden_size, lora_nums, max_rank, - seq_length, dtype, nslices, device): - try: - seq_len_tensor = torch.randint(seq_length, seq_length + 1, - (batches, )).to(device) - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), - dim=0, - ).to(device) - total_tokens = seq_len_tensor.sum() - inputs_tensor = torch.rand( - (total_tokens, max_rank), - dtype=dtype, - ).to(device) - lora_weights_lst = [] - for _ in range(nslices): - lora_weights_lst.append( - torch.rand( - (lora_nums, hidden_size, max_rank), # col-major - dtype=dtype, - ).to(device)) - # expand op needs to complete y+=a@lora_b, so output is - # initinized randomly - ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices), - dtype=dtype).to(device) - # Ensure the same input. 
- our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint( - 0, lora_nums - 1 if lora_nums > 1 else 1, (batches, )) - indices = torch.zeros((total_tokens), dtype=torch.long).to(device) - current_offset = 0 - for b_id in range(batches): - lora_index = lora_indices_tensor[b_id] - indices[current_offset:current_offset + - seq_len_tensor[b_id]] = lora_index.item() - current_offset += seq_len_tensor[b_id].item() - - lora_indices_tensor = lora_indices_tensor.to(device) - return ( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) - except Exception as error: - raise error - - @pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["shrink", "expand"]) @@ -235,6 +138,7 @@ def test_punica_sgmv( batches: int, num_loras: int, rank: int, + hidden_size: int, scaling: float, dtype: torch.dtype, op_type: str, @@ -247,10 +151,6 @@ def test_punica_sgmv( if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) - hidden_size = HIDDEN_SIZES[hidden_size_index] - if hidden_size > 100000: - hidden_size = hidden_size // 4 # avoid OOM seq_length = 128 ( inputs_tensor, @@ -261,8 +161,16 @@ def test_punica_sgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batches, hidden_size, num_loras, rank, seq_length, - dtype, op_type, device) + ) = generate_data( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + op_type, + device, + ) max_seq_length = seq_len_tensor.max() if isinstance(max_seq_length, tuple): max_seq_length = max_seq_length[0].item() @@ -292,7 +200,7 @@ def test_punica_sgmv( max_seq_length, add_inputs=True, ) - _torch_groupgemm( + ref_torch_groupgemm( ref_out_tensor, inputs_tensor, lora_weights, @@ -310,6 +218,7 @@ def test_punica_sgmv( @pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["shrink", "expand"]) @@ -319,6 +228,7 @@ def test_punica_bgmv( batches: int, num_loras: int, rank: int, + hidden_size: int, scaling: float, dtype: torch.dtype, op_type: str, @@ -327,16 +237,13 @@ def test_punica_bgmv( ): from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel + random.seed(seed) torch.set_default_device(device) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) - hidden_size = HIDDEN_SIZES[hidden_size_index] - if hidden_size > 100000: - hidden_size = hidden_size // 4 # avoid OOM seq_length = 1 ( inputs_tensor, @@ -347,14 +254,24 @@ def test_punica_bgmv( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data(batches, hidden_size, num_loras, rank, seq_length, - dtype, op_type, device) + ) = generate_data( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + op_type, + device, + ) if op_type == "shrink": - #The current _bgmv_shrink_kernel does not require the libentry + # The current 
_bgmv_shrink_kernel does not require the libentry # decoration. The purpose of adding this patch is to test the # correctness of libentry. - with patch("vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", - LibEntry(_bgmv_shrink_kernel)): + with patch( + "vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", + LibEntry(_bgmv_shrink_kernel), + ): bgmv_shrink( inputs_tensor, lora_weights, @@ -363,9 +280,11 @@ def test_punica_bgmv( scaling, ) else: - #ditto - with patch("vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", - LibEntry(_bgmv_expand_kernel)): + # ditto + with patch( + "vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", + LibEntry(_bgmv_expand_kernel), + ): bgmv_expand( inputs_tensor, lora_weights, @@ -373,7 +292,7 @@ def test_punica_bgmv( indices, add_inputs=True, ) - _torch_groupgemm( + ref_torch_groupgemm( ref_out_tensor, inputs_tensor, lora_weights, @@ -391,6 +310,7 @@ def test_punica_bgmv( @pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("nslices", [2, 3]) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["sgmv", "bgmv"]) @@ -400,6 +320,7 @@ def test_punica_expand_nslices( batches: int, num_loras: int, rank: int, + hidden_size: int, nslices: int, dtype: torch.dtype, op_type: str, @@ -407,15 +328,12 @@ def test_punica_expand_nslices( device: str, ): from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel + random.seed(seed) torch.set_default_device(device) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - hidden_size_index = random.randint(0, len(HIDDEN_SIZES) - 1) - hidden_size = HIDDEN_SIZES[hidden_size_index] - if hidden_size > 100000: - hidden_size = hidden_size // 4 # avoid OOM seq_length = 128 if op_type == "sgmv" else 1 ( inputs_tensor, @@ -426,7 +344,7 @@ def test_punica_expand_nslices( lora_indices_tensor, seq_len_tensor, indices, - ) = _generate_data_expand_nslices( + ) = generate_data_for_expand_nslices( batches, hidden_size, num_loras, @@ -459,12 +377,13 @@ def test_punica_expand_nslices( add_inputs=True, ) else: - #The current _bgmv_expand_slice_kernel does not require the + # The current _bgmv_expand_slice_kernel does not require the # libentry decoration. The purpose of adding this patch is to test # the correctness of libentry. 
with patch( "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel", - LibEntry(_bgmv_expand_slice_kernel)): + LibEntry(_bgmv_expand_slice_kernel), + ): bgmv_expand_slice( inputs_tensor, lora_weights, @@ -474,7 +393,7 @@ def test_punica_expand_nslices( slice_size=hidden_size, add_inputs=True, ) - _torch_groupgemm( + ref_torch_groupgemm( ref_outputs[:, slice_offset:slice_offset + hidden_size], inputs_tensor, lora_weights, @@ -487,21 +406,3 @@ def test_punica_expand_nslices( slice_offset += hidden_size assert_close(our_outputs, ref_outputs) - - -if __name__ == "__main__": - from itertools import product - lst = list( - product( - BATCHES, - NUM_LORA, - MAX_RANKS, - [1.0], - [torch.float16], - ["shrink"], - SEED, - CUDA_DEVICES, - )) - for ele in lst: - test_punica_bgmv(*ele) - print(f"{ele},pass") diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_variation.py new file mode 100644 index 000000000000..7e73ea67ee5f --- /dev/null +++ b/tests/lora/test_punica_variation.py @@ -0,0 +1,342 @@ +""" +This script is mainly used to test whether trtion kernels can run normally +under different conditions, including various batches, numbers of LoRA , and +maximum ranks. +""" +import random +from unittest.mock import patch + +import pytest +import torch + +from vllm.lora.ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice +from vllm.lora.ops.bgmv_shrink import bgmv_shrink +from vllm.lora.ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice +from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.triton_utils.libentry import LibEntry + +from .utils import (generate_data, generate_data_for_expand_nslices, + ref_torch_groupgemm) + +HIDDEN_SIZES = [3424, 4096, 4097] + +BATCHES = [1, 4, 16, 32] +NUM_LORA = [1, 4, 8, 16, 32, 64, 128] +DTYPES = [torch.float16, torch.bfloat16] +MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] +SCALES = [0.5] +SEED = [0] +CUDA_DEVICES = [f"cuda:{0}"] + + +def assert_close(a, b): + rtol, atol = { + torch.float16: (6e-2, 6e-2), + torch.bfloat16: (6e-2, 6e-2), + torch.float32: (1e-2, 1e-2), + }[a.dtype] + torch.testing.assert_close(a, b, rtol=rtol, atol=atol) + + +@pytest.mark.parametrize("batches", BATCHES) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_punica_sgmv( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + random.seed(seed) + torch.set_default_device(device) + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + seq_length = 128 + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = generate_data( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + op_type, + device, + ) + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + if op_type == "shrink": + sgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + 
seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + scaling, + ) + else: + sgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + add_inputs=True, + ) + ref_torch_groupgemm( + ref_out_tensor, + inputs_tensor, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batches, + scaling if op_type == "shrink" else 1.0, + op_type, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) + + +@pytest.mark.parametrize("batches", BATCHES) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("scaling", SCALES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_punica_bgmv( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + scaling: float, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel + from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel + + random.seed(seed) + torch.set_default_device(device) + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + seq_length = 1 + ( + inputs_tensor, + lora_weights, + our_out_tensor, + ref_out_tensor, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = generate_data( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + op_type, + device, + ) + if op_type == "shrink": + # The current _bgmv_shrink_kernel does not require the libentry + # decoration. The purpose of adding this patch is to test the + # correctness of libentry. 
+ with patch( + "vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", + LibEntry(_bgmv_shrink_kernel), + ): + bgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + scaling, + ) + else: + # ditto + with patch( + "vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", + LibEntry(_bgmv_expand_kernel), + ): + bgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + add_inputs=True, + ) + ref_torch_groupgemm( + ref_out_tensor, + inputs_tensor, + lora_weights, + lora_indices_tensor, + seq_len_tensor, + batches, + scaling if op_type == "shrink" else 1.0, + op_type, + ) + if op_type == "shrink": + ref_out_tensor = ref_out_tensor.to(torch.float32) + assert_close(our_out_tensor, ref_out_tensor) + + +@pytest.mark.parametrize("batches", BATCHES) +@pytest.mark.parametrize("num_loras", NUM_LORA) +@pytest.mark.parametrize("rank", MAX_RANKS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("nslices", [2, 3]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"]) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_punica_expand_nslices( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + nslices: int, + dtype: torch.dtype, + op_type: str, + seed: int, + device: str, +): + from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel + + random.seed(seed) + torch.set_default_device(device) + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + seq_length = 128 if op_type == "sgmv" else 1 + ( + inputs_tensor, + lora_weights_lst, + our_outputs, + ref_outputs, + b_seq_start_loc, + lora_indices_tensor, + seq_len_tensor, + indices, + ) = generate_data_for_expand_nslices( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + nslices, + device, + ) + max_seq_length = seq_len_tensor.max() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + slice_offset = 0 + for index in range(nslices): + lora_weights = lora_weights_lst[index] + if op_type == "sgmv": + sgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + slice_offset, + hidden_size, + add_inputs=True, + ) + else: + # The current _bgmv_expand_slice_kernel does not require the + # libentry decoration. The purpose of adding this patch is to test + # the correctness of libentry. 
+ with patch(
+ "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel",
+ LibEntry(_bgmv_expand_slice_kernel),
+ ):
+ bgmv_expand_slice(
+ inputs_tensor,
+ lora_weights,
+ our_outputs,
+ indices,
+ slice_offset,
+ slice_size=hidden_size,
+ add_inputs=True,
+ )
+ ref_torch_groupgemm(
+ ref_outputs[:, slice_offset:slice_offset + hidden_size],
+ inputs_tensor,
+ lora_weights,
+ lora_indices_tensor,
+ seq_len_tensor,
+ batches,
+ 1.0,
+ op_type="expand",
+ )
+
+ slice_offset += hidden_size
+ assert_close(our_outputs, ref_outputs)
+
+
+if __name__ == "__main__":
+ from itertools import product
+
+ lst = list(
+ product(
+ BATCHES,
+ NUM_LORA,
+ MAX_RANKS,
+ HIDDEN_SIZES,
+ [1.0],
+ [torch.float16],
+ ["expand"],
+ SEED,
+ CUDA_DEVICES,
+ ))
+ for ele in lst:
+ test_punica_bgmv(*ele)
+ print(f"{ele},pass")
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index b73cf5bf5532..00f8e26d1041 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -86,3 +86,151 @@ def init_packed_lora(
 packed_lora = PackedLoRALayerWeights.pack(base_loras)
 self.set_module_lora(module_name, packed_lora)
 return packed_lora
+
+
+def assert_close(a, b):
+ rtol, atol = {
+ torch.float16: (6e-2, 6e-2),
+ torch.bfloat16: (6e-2, 6e-2),
+ torch.float32: (1e-2, 1e-2),
+ }[a.dtype]
+ torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
+
+
+def ref_torch_groupgemm(
+ out_tensor,
+ inputs,
+ lora_weights,
+ lora_indices_tensor,
+ seq_len_tensor,
+ batches,
+ scaling,
+ op_type,
+) -> torch.Tensor:
+ out_list = []
+ current_offset = 0
+ for lora_index, b_length in zip(range(batches), seq_len_tensor):
+ input_weight = inputs[current_offset:b_length + current_offset, :]
+ current_offset += b_length
+ lora_weight = lora_weights[lora_indices_tensor[lora_index]]
+ result = torch.nn.functional.linear(input_weight, lora_weight)
+ result *= scaling
+ out_list.append(result)
+ cat_result = torch.cat(out_list, dim=0)
+ if op_type == "expand":
+ out_tensor += cat_result
+ else:
+ out_tensor.copy_(cat_result)
+ return
+
+
+def generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype,
+ op_type, device):
+ seq_len_tensor = torch.randint(seq_length, seq_length + 1,
+ (batches, )).to(device)
+ b_seq_start_loc = torch.cumsum(
+ torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
+ dim=0,
+ ).to(device)
+ total_tokens = seq_len_tensor.sum()
+ if op_type == "shrink":
+ inputs_tensor = torch.rand((total_tokens, hidden_size),
+ dtype=dtype).to(device)
+ lora_weights = torch.rand(
+ (lora_nums, max_rank, hidden_size), # col-major
+ dtype=dtype,
+ ).to(device)
+ # shrink op needs atomic_add, so output is initialized to 0
+ ref_out_tensor = torch.zeros((total_tokens, max_rank),
+ dtype=dtype,
+ device=inputs_tensor.device)
+ # NOTE shrink kernel uses torch.float32 as output type
+ our_out_tensor = torch.zeros((total_tokens, max_rank),
+ dtype=torch.float32).to(device)
+ else:
+ inputs_tensor = torch.rand(
+ (total_tokens, max_rank),
+ dtype=dtype,
+ ).to(device)
+ lora_weights = torch.rand(
+ (lora_nums, hidden_size, max_rank), # col-major
+ dtype=dtype,
+ ).to(device)
+ # expand op needs to complete y+=a@lora_b, so output is
+ # initialized randomly
+ ref_out_tensor = torch.rand(
+ (total_tokens, hidden_size),
+ dtype=dtype,
+ ).to(device)
+ # Ensure the same input.
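+ # (expand accumulates into the existing output, so the kernel under test
+ # and the torch reference must start from identical buffers)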
+ our_out_tensor = ref_out_tensor.clone()
+ lora_indices_tensor = torch.randint(0,
+ lora_nums - 1 if lora_nums > 1 else 1,
+ (batches, )).to(device)
+ indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
+ current_offset = 0
+ for b_id in range(batches):
+ lora_index = lora_indices_tensor[b_id]
+ indices[current_offset:current_offset +
+ seq_len_tensor[b_id]].copy_(lora_index)
+ current_offset += seq_len_tensor[b_id].item()
+ return (
+ inputs_tensor,
+ lora_weights,
+ our_out_tensor,
+ ref_out_tensor,
+ b_seq_start_loc,
+ lora_indices_tensor,
+ seq_len_tensor,
+ indices,
+ )
+
+
+def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank,
+ seq_length, dtype, nslices, device):
+ seq_len_tensor = torch.randint(seq_length, seq_length + 1,
+ (batches, )).to(device)
+ b_seq_start_loc = torch.cumsum(
+ torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
+ dim=0,
+ ).to(device)
+ total_tokens = seq_len_tensor.sum()
+ inputs_tensor = torch.rand(
+ (total_tokens, max_rank),
+ dtype=dtype,
+ ).to(device)
+ lora_weights_lst = []
+ for _ in range(nslices):
+ lora_weights_lst.append(
+ torch.rand(
+ (lora_nums, hidden_size, max_rank), # col-major
+ dtype=dtype,
+ ).to(device))
+ # expand op needs to complete y+=a@lora_b, so output is
+ # initialized randomly
+ ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices),
+ dtype=dtype).to(device)
+ # Ensure the same input.
+ our_out_tensor = ref_out_tensor.clone()
+ lora_indices_tensor = torch.randint(0,
+ lora_nums - 1 if lora_nums > 1 else 1,
+ (batches, ))
+ indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
+ current_offset = 0
+ for b_id in range(batches):
+ lora_index = lora_indices_tensor[b_id]
+ indices[current_offset:current_offset +
+ seq_len_tensor[b_id]] = lora_index.item()
+ current_offset += seq_len_tensor[b_id].item()
+
+ lora_indices_tensor = lora_indices_tensor.to(device)
+ return (
+ inputs_tensor,
+ lora_weights_lst,
+ our_out_tensor,
+ ref_out_tensor,
+ b_seq_start_loc,
+ lora_indices_tensor,
+ seq_len_tensor,
+ indices,
+ )
diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py
index 2d09c7cfe6c8..dcaf2e3d462c 100644
--- a/vllm/lora/ops/bgmv_expand.py
+++ b/vllm/lora/ops/bgmv_expand.py
@@ -56,7 +56,7 @@ def _bgmv_expand_kernel(
 mask=offset_k < K,
 other=0,
 ) # [BLOCK_K]
-
+ # N must be divisible by SPLIT_N
 split_n_length = tl.cdiv(N, SPLIT_N)
 if CAST_TYPE:
 tiled_a = tiled_a.to(lora_ptr.dtype.element_ty)
diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py
index 31b2cd545d3d..fa6571074f3a 100644
--- a/vllm/lora/ops/bgmv_expand_slice.py
+++ b/vllm/lora/ops/bgmv_expand_slice.py
@@ -57,7 +57,7 @@ def _bgmv_expand_slice_kernel(
 mask=offset_k < K,
 other=0,
 ) # [BLOCK_K]
-
+ # N must be divisible by SPLIT_N
 split_n_length = tl.cdiv(N, SPLIT_N)
 if CAST_TYPE:
 tiled_a = tiled_a.to(lora_ptr.dtype.element_ty)
diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py
index 6a637288f71e..7c3e27313ad9 100644
--- a/vllm/lora/ops/utils.py
+++ b/vllm/lora/ops/utils.py
@@ -8,9 +8,25 @@ def _get_op_configs(op_type: str, batch: int, hidden_size: int):
 return None
 
 
+def _check_divisibility(hidden_size: int):
+ # The bgmv_expand kernel requires that the hidden_size be divisible by
+ # the number below.
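+ # The largest matching divisor is used as SPLIT_N in _get_default_config
+ # below, so every split covers a whole number of columns; odd hidden
+ # sizes fall back to 1.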
+ divisibility = [2, 4, 8, 16, 32, 64] + divisibility.sort(reverse=True) + for div in divisibility: + if hidden_size % div == 0: + return div + # hidden_size is an odd number + return 1 + + def _get_default_config(op_type: str, batch: int, hidden_size: int): if op_type == "expand": - return {"BLOCK_N": 256, "SPLIT_N": 64, "num_warps": 8} + return { + "BLOCK_N": 256, + "SPLIT_N": _check_divisibility(hidden_size), + "num_warps": 8 + } else: return {"BLOCK_K": 256, "SPLIT_K": 64, "num_warps": 8} From 9a4f1472cb1f70d0c2527c024e96e98f3a3ccf52 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Sun, 28 Jul 2024 21:55:44 +0800 Subject: [PATCH 70/71] fix docstring bug --- vllm/lora/fully_sharded_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index f751434bb7b4..a7887a048746 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -95,7 +95,7 @@ def can_replace_layer( def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): """ MergedColumnParallelLinearWithShardedLoRA and - QKVParallelLinearWithShardedLora share the same + MergedQKVParallelLinearWithShardedLora share the same LoRa weight application method. The main difference is the step by shard_size for lora_b which can From 6620ffb7bd73449f464e88fa66b62a6bee0e81d8 Mon Sep 17 00:00:00 2001 From: jeejeeli Date: Sun, 28 Jul 2024 22:38:02 +0800 Subject: [PATCH 71/71] modify max batches --- vllm/lora/models.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 9a9b4766cf41..017a1002bb9a 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -31,9 +31,6 @@ _GLOBAL_LORA_ID = 0 -# NOTE This value comes from vllm/worker/model_runner.py -_MAX_BATCH_SIZE = 256 - @dataclass class LongContextLoRAContext: @@ -318,7 +315,7 @@ def __init__( self.vocab_size = vocab_size self.long_lora_context: Optional[LongContextLoRAContext] = None self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, - max_batches=_MAX_BATCH_SIZE, + max_batches=self.max_num_seqs, device="cuda") # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora.
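For reference, a minimal standalone sketch of the SPLIT_N selection introduced in
vllm/lora/ops/utils.py above; check_divisibility here is an illustrative stand-in
that mirrors _check_divisibility from the diff:

    def check_divisibility(hidden_size: int) -> int:
        # Largest power of two (up to 64) that divides hidden_size; odd
        # sizes fall back to 1, so SPLIT_N always divides N exactly.
        for div in (64, 32, 16, 8, 4, 2):
            if hidden_size % div == 0:
                return div
        return 1

    assert check_divisibility(4096) == 64
    assert check_divisibility(1000) == 8
    assert check_divisibility(3333) == 1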