
Commit 3f8a04d

upd

1 parent e85a14f

File tree

2 files changed (+142, -30)

src/runtime/relax_vm/paged_kv_cache.cc (82 additions, 29 deletions)
@@ -178,10 +178,6 @@ struct Sequence {
       }
       block_ptr = block.parent_idx;
     }
-    CHECK_LE(depth, kPagedKVCacheMaxBlockDepth)
-        << "Paged KV cache supports one sequence to reuse " << kPagedKVCacheMaxBlockDepth
-        << " prefixes (the fork depth) at most. However, the given sequence has fork depth "
-        << depth;
   }

   std::vector<int32_t> GetBlockTrace(const std::vector<Block>& global_block_pool) const {
@@ -1490,19 +1486,29 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
       is_chain_ = true;
     }

-    std::vector<std::vector<int32_t>> block_ids_on_depths = GetBlockIdsOnDepth(sequences);
-    num_depths_ = block_ids_on_depths.size();
+    auto [block_ids_on_depths, trailing_blocks] = GetBlockIdsOnDepth(sequences);
+    num_depths_ =
+        std::min(static_cast<int>(block_ids_on_depths.size()), kPagedKVCacheMaxBlockDepth);
     ICHECK_LE(num_depths_, kPagedKVCacheMaxBlockDepth);

     std::vector<std::vector<std::pair<int32_t, int32_t>>> chunked_block_ids_arr;
     chunked_block_ids_arr.reserve(num_depths_);
     use_decode_kernel_.clear();
     for (int d = 0; d < num_depths_; ++d) {
-      auto [chunked_block_ids, use_decode_kernel] = GetChunkedBlockIds(block_ids_on_depths[d]);
+      // We force the blocks at the maximum depth not to coalesce, so that they can be
+      // concatenated with the trailing exceeding blocks.
+      auto [chunked_block_ids, use_decode_kernel] = GetChunkedBlockIds(
+          block_ids_on_depths[d], /*enable_coalesce=*/d != kPagedKVCacheMaxBlockDepth - 1);
       chunked_block_ids_arr.push_back(chunked_block_ids);
       use_decode_kernel_.push_back(use_decode_kernel);
     }

+    if (num_depths_ == kPagedKVCacheMaxBlockDepth) {
+      // Since we force the blocks at the maximum depth not to coalesce, the output blocks at
+      // the maximum depth must have the same size as the current batch.
+      CHECK_EQ(chunked_block_ids_arr[num_depths_ - 1].size(), cur_batch_size_);
+    }
+
     append_before_attn_ = !support_sliding_window_ && num_depths_ == 1 && use_decode_kernel_[0];
     if (append_before_attn_) {
       // Right now we use different kernels when depth is 1 or not 1.
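
Note: the blocks at the maximum depth are deliberately left uncoalesced so that every sequence keeps its own entry at that depth, and the sequence's trailing blocks beyond kPagedKVCacheMaxBlockDepth can later be appended to exactly that entry. This is also what lets the CHECK_EQ above assert that the number of max-depth entries equals the batch size.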
@@ -1530,7 +1536,8 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
       k_rope_pos_offset_h.clear();
       qo_indptr_h.push_back(0);
       page_indptr_h.push_back(0);
-      for (const auto& [block_id, chunk_append_length] : chunked_block_ids_arr[d]) {
+      for (int i = 0; i < static_cast<int>(chunked_block_ids_arr[d].size()); ++i) {
+        const auto& [block_id, chunk_append_length] = chunked_block_ids_arr[d][i];
         qo_indptr_h.push_back(qo_indptr_h.back() + chunk_append_length);
         if (block_id == -1) {
           page_indptr_h.push_back(page_indptr_h.back());
@@ -1539,19 +1546,53 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
           sink_size_h.push_back(0);
           k_rope_pos_offset_h.push_back(0);
         } else {
-          const Block& block = global_block_pool_[block_id];
-          page_indptr_h.push_back(page_indptr_h.back() + block.page_ids.size());
-          for (int32_t page_id : block.page_ids) {
-            page_indices_h.push_back(page_id);
+          if (d < kPagedKVCacheMaxBlockDepth - 1) {
+            // Blocks not at maximum depth
+            const Block& block = global_block_pool_[block_id];
+            page_indptr_h.push_back(page_indptr_h.back() + block.page_ids.size());
+            for (int32_t page_id : block.page_ids) {
+              page_indices_h.push_back(page_id);
+            }
+            last_page_len_h.push_back(
+                block.seq_length == 0
+                    ? 0
+                    : (block.seq_length - block.sink_length + block.sliding_window_offset - 1) %
+                              page_size_ +
+                          1);
+            sliding_window_offset_h.push_back(block.sliding_window_offset);
+            sink_size_h.push_back(block.sink_length);
+            k_rope_pos_offset_h.push_back(block.start_pos);
+          } else {
+            // Blocks at maximum depth
+            const Block& block = global_block_pool_[block_id];
+            int32_t num_pages = static_cast<int32_t>(block.page_ids.size());
+            int32_t total_seq_length = static_cast<int32_t>(block.seq_length);
+            int32_t last_block_id = block_id;
+            for (int32_t page_id : block.page_ids) {
+              page_indices_h.push_back(page_id);
+            }
+            for (int32_t id : trailing_blocks[i]) {
+              // Collect trailing blocks if available
+              const Block& block = global_block_pool_[id];
+              for (int32_t page_id : block.page_ids) {
+                page_indices_h.push_back(page_id);
+              }
+              num_pages += block.page_ids.size();
+              total_seq_length += block.seq_length;
+              last_block_id = id;
+            }
+            page_indptr_h.push_back(page_indptr_h.back() + num_pages);
+            const Block& last_block = global_block_pool_[last_block_id];
+            last_page_len_h.push_back(total_seq_length == 0
+                                          ? 0
+                                          : (total_seq_length - last_block.sink_length +
+                                             last_block.sliding_window_offset - 1) %
+                                                    page_size_ +
+                                                1);
+            sliding_window_offset_h.push_back(last_block.sliding_window_offset);
+            sink_size_h.push_back(last_block.sink_length);
+            k_rope_pos_offset_h.push_back(block.start_pos);
           }
-          last_page_len_h.push_back(block.seq_length == 0 ? 0
-                                                          : (block.seq_length - block.sink_length +
-                                                             block.sliding_window_offset - 1) %
-                                                                    page_size_ +
-                                                                1);
-          sliding_window_offset_h.push_back(block.sliding_window_offset);
-          sink_size_h.push_back(block.sink_length);
-          k_rope_pos_offset_h.push_back(block.start_pos);
         }
       }
     }
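
Note: for a max-depth entry, the last-page length is computed over the merged span, i.e. the depth-limited block plus all of its trailing blocks: for a nonzero total_seq_length it is (total_seq_length - sink_length + sliding_window_offset - 1) % page_size + 1. For example, with page_size 16, a merged length of 30, and no sink tokens or sliding-window offset, the last page holds (30 - 1) % 16 + 1 = 14 entries.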
@@ -2035,22 +2076,34 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
   /*!
    * \brief For the given list of sequences, check the block trace of
    * each sequence, and return the block ids used by the sequences
-   * on each depth.
+   * on each depth. If the depth is larger than kPagedKVCacheMaxBlockDepth,
+   * the exceeding blocks are concatenated and returned separately.
    * More precisely, the inner returned vector contains the block ids
    * used by the sequences on a certain depth (or "-1" if a sequence
    * has fewer depths). The outer returned vector contains the inner
    * vectors from the lowest depth to the highest depth.
    */
-  std::vector<std::vector<int32_t>> GetBlockIdsOnDepth(
-      const std::vector<Sequence*>& sequences) const {
+  std::pair<std::vector<std::vector<int32_t>>, std::vector<std::vector<int32_t>>>
+  GetBlockIdsOnDepth(const std::vector<Sequence*>& sequences) const {
     // - Get the trace of each sequence.
     int64_t num_depths = 0;
     std::vector<std::vector<int32_t>> seq_block_traces;
+    std::vector<std::vector<int32_t>> trailing_block_traces;
     seq_block_traces.reserve(cur_batch_size_);
+    trailing_block_traces.reserve(cur_batch_size_);
     for (int i = 0; i < cur_batch_size_; ++i) {
       std::vector<int32_t> trace = sequences[i]->GetBlockTrace(global_block_pool_);
-      num_depths = std::max(num_depths, static_cast<int64_t>(trace.size()));
-      seq_block_traces.push_back(std::move(trace));
+      if (static_cast<int>(trace.size()) <= kPagedKVCacheMaxBlockDepth) {
+        seq_block_traces.push_back(std::vector<int32_t>(trace.begin(), trace.end()));
+        trailing_block_traces.push_back({});
+        num_depths = std::max(num_depths, static_cast<int64_t>(trace.size()));
+      } else {
+        seq_block_traces.push_back(
+            std::vector<int32_t>(trace.begin(), trace.begin() + kPagedKVCacheMaxBlockDepth));
+        trailing_block_traces.push_back(
+            std::vector<int32_t>(trace.begin() + kPagedKVCacheMaxBlockDepth, trace.end()));
+        num_depths = std::max(num_depths, static_cast<int64_t>(kPagedKVCacheMaxBlockDepth));
+      }
     }

     // "Transpose" the traces, yielding the block ids used on each depth.
@@ -2065,7 +2118,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
       }
       block_ids_on_depths.push_back(std::move(block_ids));
     }
-    return block_ids_on_depths;
+    return {block_ids_on_depths, trailing_block_traces};
   }

   /*!
@@ -2081,7 +2134,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
    * input blocks.
    */
   std::pair<std::vector<std::pair<int32_t, int32_t>>, bool> GetChunkedBlockIds(
-      const std::vector<int32_t>& block_ids) const {
+      const std::vector<int32_t>& block_ids, bool enable_coalesce = true) const {
     std::vector<std::pair<int32_t, int32_t>> uncoalesced_block_ids;
     std::vector<std::pair<int32_t, int32_t>> coalesced_block_ids;

@@ -2115,8 +2168,8 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
     double coalesce_ratio = 1.0 * page_counter_uncoalesced / page_counter_coalesced;
     // Do not coalesce and use batch decode kernel when coalesce ratio is small.
     bool use_decode_kernel = is_decode_request_ && coalesce_ratio < 1.1;
-
-    return {use_decode_kernel ? uncoalesced_block_ids : coalesced_block_ids, use_decode_kernel};
+    return {use_decode_kernel || !enable_coalesce ? uncoalesced_block_ids : coalesced_block_ids,
+            use_decode_kernel};
   }

   /*! \brief Invoke the "begin forward" functions of underlying kernels. */
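
Taken together, the change in this file lifts the previous hard limit on fork depth: instead of tripping the removed CHECK_LE in Sequence, GetBlockIdsOnDepth now caps the per-depth bookkeeping at kPagedKVCacheMaxBlockDepth and returns the exceeding blocks as per-sequence trailing traces, which the attention setup concatenates onto each sequence's max-depth block. Below is a minimal standalone sketch of just the splitting step, with simplified names (kMaxBlockDepth stands in for kPagedKVCacheMaxBlockDepth; an illustration, not the actual class code):

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

constexpr int kMaxBlockDepth = 5;  // stand-in for kPagedKVCacheMaxBlockDepth

// Split each sequence's block trace into a prefix of at most kMaxBlockDepth
// blocks (processed depth by depth) and the trailing blocks beyond that
// depth (later concatenated onto the sequence's max-depth block).
std::pair<std::vector<std::vector<int32_t>>, std::vector<std::vector<int32_t>>>
SplitTraces(const std::vector<std::vector<int32_t>>& traces) {
  std::vector<std::vector<int32_t>> prefixes;
  std::vector<std::vector<int32_t>> trailing;
  prefixes.reserve(traces.size());
  trailing.reserve(traces.size());
  for (const std::vector<int32_t>& trace : traces) {
    if (static_cast<int>(trace.size()) <= kMaxBlockDepth) {
      prefixes.emplace_back(trace.begin(), trace.end());
      trailing.emplace_back();
    } else {
      prefixes.emplace_back(trace.begin(), trace.begin() + kMaxBlockDepth);
      trailing.emplace_back(trace.begin() + kMaxBlockDepth, trace.end());
    }
  }
  return {prefixes, trailing};
}

int main() {
  // One sequence forked eight levels deep, as in the new test below.
  auto [prefixes, trailing] = SplitTraces({{0, 1, 2, 3, 5, 7, 9, 11}});
  std::cout << "prefix:";
  for (int32_t id : prefixes[0]) std::cout << ' ' << id;
  std::cout << " | trailing:";
  for (int32_t id : trailing[0]) std::cout << ' ' << id;
  std::cout << '\n';  // prefix: 0 1 2 3 5 | trailing: 7 9 11
  return 0;
}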

tests/python/relax/test_runtime_builtin_paged_attention_kv_cache_tir.py (60 additions, 1 deletion)
@@ -581,7 +581,13 @@ def test_paged_attention_kv_cache_fork_sequence(kv_cache_and_config):
     apply_attention(kv_cache, rope_mode, [((11, 0, 60), 45), ((12, 0, 15), 14)], cached_k, cached_v)
     apply_attention(kv_cache, rope_mode, [((13, 0, 16), 19), ((14, 0, 17), 19)], cached_k, cached_v)
     apply_attention(kv_cache, rope_mode, [((15, 5, 60), 8), ((16, 5, 80), 10)], cached_k, cached_v)
-    apply_attention(kv_cache, rope_mode, [((17, 5, 75), 11), ((18, 5, 76), 45), ((19, 5, 77), 14)], cached_k, cached_v)
+    apply_attention(
+        kv_cache,
+        rope_mode,
+        [((17, 5, 75), 11), ((18, 5, 76), 45), ((19, 5, 77), 14)],
+        cached_k,
+        cached_v,
+    )

     operation_seq = [
         [(6, 1), (11, 1), (13, 1), (9, 1)],
@@ -607,6 +613,57 @@ def test_paged_attention_kv_cache_fork_sequence(kv_cache_and_config):
     assert fis_empty(kv_cache), "The KV cache is not empty after removing all sequences"


+@tvm.testing.requires_gpu
+@tvm.testing.requires_cuda
+def test_paged_attention_kv_cache_unlimited_depth(kv_cache_and_config):
+    kv_cache, rope_mode, support_sliding_window = kv_cache_and_config
+    if support_sliding_window and rope_mode == RopeMode.NORMAL:
+        # Normal RoPE mode under sliding window settings is not supported.
+        return
+    fclear(kv_cache)
+
+    cached_k = {}
+    cached_v = {}
+    apply_attention(kv_cache, rope_mode, [(0, 30)], cached_k, cached_v)
+    # Fork existing sequences.
+    apply_attention(kv_cache, rope_mode, [((1, 0, -1), 15)], cached_k, cached_v)
+    apply_attention(kv_cache, rope_mode, [((2, 1, -1), 5)], cached_k, cached_v)
+    apply_attention(kv_cache, rope_mode, [((3, 2, -1), 20)], cached_k, cached_v)
+    apply_attention(kv_cache, rope_mode, [((4, 3, -1), 26)], cached_k, cached_v)
+    apply_attention(kv_cache, rope_mode, [((5, 3, -1), 18)], cached_k, cached_v)
+    apply_attention(kv_cache, rope_mode, [((6, 5, -1), 22)], cached_k, cached_v)
+    apply_attention(kv_cache, rope_mode, [((7, 5, -1), 12)], cached_k, cached_v)
+    apply_attention(kv_cache, rope_mode, [((8, 7, -1), 29)], cached_k, cached_v)
+    apply_attention(kv_cache, rope_mode, [((9, 7, -1), 9)], cached_k, cached_v)
+    apply_attention(kv_cache, rope_mode, [((10, 9, -1), 31)], cached_k, cached_v)
+    apply_attention(kv_cache, rope_mode, [((11, 9, -1), 4)], cached_k, cached_v)
+    # 0 <- 1 <- 2 <- 3 <- 5 <- 7 <- 9 <- 11
+    #                |    |    |    |
+    #                4    6    8    10
+    # Decode.
+    operation_seq = [
+        [(3, 1), (6, 1), (9, 1)],
+        [(4, 1), (8, 1), (10, 1)],
+        [(5, 1), (7, 1), (11, 1)],
+    ]
+    for batch in operation_seq:
+        apply_attention(kv_cache, rope_mode, batch, cached_k, cached_v)
+
+    num_sequence = 12
+    for i in range(num_sequence):
+        fremove_sequence(kv_cache, i)
+        cached_k.pop(i)
+        cached_v.pop(i)
+        verify_cached_kv(
+            kv_cache,
+            seq_ids=list(range(i + 1, num_sequence)),
+            expected_k=cached_k,
+            expected_v=cached_v,
+        )
+
+    assert fis_empty(kv_cache), "The KV cache is not empty after removing all sequences"
+
+
 @tvm.testing.requires_gpu
 @tvm.testing.requires_cuda
 def test_paged_attention_kv_cache_popn(kv_cache_and_config):
@@ -2526,6 +2583,7 @@ def compact_kv_copy(
     for head_dim, dtype, rope_mode, support_sliding_window in itertools.product(
         HEAD_DIMS, DTYPES, ROPE_MODES, SUPPORT_SLIDING_WINDOW
     ):
+        print(head_dim, dtype, rope_mode, support_sliding_window)
         set_global_func(head_dim, dtype)
         cache = create_kv_cache(head_dim, dtype, rope_mode, support_sliding_window)
         cache_and_config = (cache, rope_mode, support_sliding_window)
@@ -2535,3 +2593,4 @@ def compact_kv_copy(
         test_paged_attention_kv_cache_popn(cache_and_config)
         test_paged_attention_kv_cache_sliding_window(cache_and_config)
         test_paged_attention_kv_cache_tree_attn(cache_and_config)
+        test_paged_attention_kv_cache_unlimited_depth(cache_and_config)
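
Note: the new test drives the cache past the compile-time depth limit: sequence 11's block trace runs 0 <- 1 <- 2 <- 3 <- 5 <- 7 <- 9 <- 11, eight blocks deep, so the decode batches exercise the trailing-block path added in paged_kv_cache.cc, and removing the sequences one by one verifies that the trailing blocks are reclaimed and the cache ends empty.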
