diff --git a/CMakeLists.txt b/CMakeLists.txt index 180b896a7aba..3fa439fbbcf8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -940,6 +940,24 @@ define_gpu_extension_target( USE_SABI 3 WITH_SOABI) +# +# _suffix_cache_C extension +# +set(VLLM_SUFFIX_CACHE_EXT_SRC + "csrc/suffix_cache/torch_bindings.cpp" + "csrc/suffix_cache/suffix_tree.cc") + +message(STATUS "Enabling suffix_cache extension.") +define_gpu_extension_target( + _suffix_cache_C + DESTINATION vllm + LANGUAGE CXX + SOURCES ${VLLM_SUFFIX_CACHE_EXT_SRC} + COMPILE_FLAGS ${CXX_COMPILE_FLAGS} + USE_SABI 3 + WITH_SOABI +) + if(VLLM_GPU_LANG STREQUAL "HIP") # # _rocm_C extension diff --git a/csrc/suffix_cache/int32_map.h b/csrc/suffix_cache/int32_map.h new file mode 100644 index 000000000000..8d62083842f1 --- /dev/null +++ b/csrc/suffix_cache/int32_map.h @@ -0,0 +1,376 @@ +// Copyright 2025 Snowflake Inc. +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * A simple hash map with int32_t keys that's designed to be fast and compact: + * - Open addressing with triangular probing allows high load factors. + * - Iteration is very fast and cache-friendly to allow fast speculation. + * - int32_t should be all we need to store token and sequence IDs. + */ +template +class Int32Map { + public: + using const_iterator_value = std::pair; + + Int32Map() = default; + + Int32Map(Int32Map&& o) noexcept + : slots_(o.slots_), + cap_(o.cap_), + size_(o.size_), + tombstones_(o.tombstones_) { + o.slots_ = nullptr; + o.cap_ = o.size_ = o.tombstones_ = 0; + } + + Int32Map& operator=(Int32Map&& o) noexcept { + if (this == &o) { + return *this; + } + destroy_all_(); + delete[] slots_; + slots_ = o.slots_; + cap_ = o.cap_; + size_ = o.size_; + tombstones_ = o.tombstones_; + o.slots_ = nullptr; + o.cap_ = o.size_ = o.tombstones_ = 0; + return *this; + } + + Int32Map(const Int32Map&) = delete; + + Int32Map& operator=(const Int32Map&) = delete; + + ~Int32Map() { + destroy_all_(); + delete[] slots_; + } + + bool empty() const noexcept { return size_ == 0; } + + uint32_t size() const noexcept { return size_; } + + bool contains(int32_t key) const { + if (key == KEY_EMPTY || key == KEY_TOMBSTONE) { + throw std::invalid_argument("invalid key"); + } + if (!slots_) { + return false; + } + uint32_t idx; + return probe_insert_or_find_(key, idx); + } + + bool erase(int32_t key) { + if (key == KEY_EMPTY || key == KEY_TOMBSTONE) { + throw std::invalid_argument("invalid key"); + } + if (!slots_) { + return false; + } + uint32_t idx; + if (!probe_insert_or_find_(key, idx)) { + return false; + } + value_ptr_(slots_[idx])->~T(); + slots_[idx].key = KEY_TOMBSTONE; + --size_; + ++tombstones_; + maybe_rehash_after_erase_(); + return true; + } + + // Construct in-place if absent, otherwise return existing. + template + T& emplace(int32_t key, Args&&... 
args) { + if (key == KEY_EMPTY || key == KEY_TOMBSTONE) { + throw std::invalid_argument("invalid key"); + } + + // Allocate minimal table if needed. + if (!slots_) { + cap_ = MIN_CAPACITY; + slots_ = new Slot[cap_]; + size_ = tombstones_ = 0; + } + + // Probe once. + uint32_t idx; + if (probe_insert_or_find_(key, idx)) { + return *value_ptr_(slots_[idx]); // already present + } + + // If we can reuse a tombstone, do it immediately without rehash. + if (slots_[idx].key == KEY_TOMBSTONE) { + --tombstones_; + slots_[idx].key = key; + ::new (static_cast(&slots_[idx].storage)) + T(std::forward(args)...); + ++size_; + return *value_ptr_(slots_[idx]); + } + + // We will use an EMPTY slot, which increases (size + tombstones). + if (static_cast(cap_) * MAX_LOAD_PCT < + static_cast(size_ + tombstones_ + 1) * 100) { + // Will exceed max load factor after insert, we need to rehash. + if (static_cast(size_ + 1) * 100 <= + static_cast(cap_) * MAX_LOAD_PCT) { + // Load would fit without tombstones, prefer same-cap cleanup. + rehash_(cap_); + } else { + // Grow capacity to the next power of 2. + rehash_(cap_ * 2); + } + // Re-probe after rehash. + bool found = probe_insert_or_find_(key, idx); + assert(!found); // Re-hashing should not change the key set. + } + + assert(slots_[idx].key == KEY_EMPTY); // Must have an empty slot now. + + slots_[idx].key = key; + ::new (static_cast(&slots_[idx].storage)) + T(std::forward(args)...); + ++size_; + return *value_ptr_(slots_[idx]); + } + + // Default-construct in-place if absent, otherwise return existing. + T& operator[](int32_t key) { return emplace(key); } + + size_t memory_usage() const noexcept { + return sizeof(*this) + sizeof(Slot) * cap_; + } + + class const_iterator { + public: + using value_type = const_iterator_value; + using difference_type = std::ptrdiff_t; + using iterator_category = std::forward_iterator_tag; + + const_iterator() : m_(nullptr), i_(0) {} + + const_iterator(const Int32Map* m, uint32_t i) : m_(m), i_(i) { advance_(); } + + value_type operator*() const { + const Slot& s = m_->slots_[i_]; + return {s.key, *m_->value_ptr_(s)}; + } + + const_iterator& operator++() { + ++i_; + advance_(); + return *this; + } + + bool operator==(const const_iterator& o) const { + return m_ == o.m_ && i_ == o.i_; + } + + bool operator!=(const const_iterator& o) const { return !(*this == o); } + + private: + void advance_() { + const uint32_t c = m_ ? m_->cap_ : 0u; + while (m_ && i_ < c && !m_->is_filled_(m_->slots_[i_].key)) { + ++i_; + } + } + const Int32Map* m_; + uint32_t i_; + }; + + const_iterator begin() const { return const_iterator(this, 0); } + + const_iterator end() const { return const_iterator(this, cap_); } + + const_iterator cbegin() const { return begin(); } + + const_iterator cend() const { return end(); } + + private: + // Reserved key representing an empty slot. + static constexpr int32_t KEY_EMPTY = INT32_MIN; + + // Reserved key representing a deleted slot (tombstone). + static constexpr int32_t KEY_TOMBSTONE = INT32_MIN + 1; + + // Keep 2 * MIN_LOAD_PCT < MAX_LOAD_PCT with some buffer to avoid thrashing. + static constexpr uint32_t MIN_LOAD_PCT = 25; + static constexpr uint32_t MAX_LOAD_PCT = 75; + + // Capacity must be a power of 2 for triangular probing to cover all indices. 
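+  // E.g. with cap_ = 8 the probe offsets are the triangular numbers
+  // 0, 1, 3, 6, 10, 15, 21, 28, which modulo 8 visit slots
+  // 0, 1, 3, 6, 2, 7, 5, 4 -- each index exactly once.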
+ static constexpr uint32_t MIN_CAPACITY = 2; + + struct Slot { + int32_t key; + typename std::aligned_storage::type storage; + Slot() noexcept : key(KEY_EMPTY) {} + }; + + // ----- Data (1 pointer + 3x uint32_t) ----- + Slot* slots_ = nullptr; + uint32_t cap_ = 0; // capacity, power of two (0 == unallocated) + uint32_t size_ = 0; // number of FILLED + uint32_t tombstones_ = 0; // number of TOMBSTONE + + // ----- Helpers ----- + static bool is_filled_(int32_t k) noexcept { + return k != KEY_EMPTY && k != KEY_TOMBSTONE; + } + + static uint32_t mix_hash_(int32_t key) noexcept { + // 32-bit mix (Murmur-inspired) + uint32_t x = static_cast(key); + x ^= x >> 16; + x *= 0x7feb352dU; + x ^= x >> 15; + x *= 0x846ca68bU; + x ^= x >> 16; + return x; + } + + static T* value_ptr_(Slot& s) noexcept { + return std::launder(reinterpret_cast(&s.storage)); + } + + static const T* value_ptr_(const Slot& s) noexcept { + return std::launder(reinterpret_cast(&s.storage)); + } + + void destroy_all_() noexcept { + if (!slots_) { + return; + } + for (uint32_t i = 0; i < cap_; ++i) { + if (is_filled_(slots_[i].key)) { + value_ptr_(slots_[i])->~T(); + slots_[i].key = KEY_EMPTY; + } + } + size_ = tombstones_ = 0; + } + + void maybe_rehash_after_erase_() { + if (!slots_) { + return; + } + + // If completely empty: free everything and return. + if (size_ == 0) { + delete[] slots_; + slots_ = nullptr; + cap_ = size_ = tombstones_ = 0; + return; + } + + // If too sparse, shrink by 1/2. + if (static_cast(size_) * 100 < + static_cast(cap_) * MIN_LOAD_PCT) { + if (cap_ / 2 >= MIN_CAPACITY) { + rehash_(cap_ / 2); + } + } + } + + // Either finds existing (true, idx_out set) or returns best insert slot + // (false). On "not found", idx_out is: first tombstone if any; else first + // empty. + bool probe_insert_or_find_(int32_t key, uint32_t& idx_out) const { + assert(slots_ && cap_ > 0 && "probe on uninitialized map"); + uint32_t idx = mix_hash_(key) & (cap_ - 1); + uint32_t step = 0; + bool has_first_tomb = false; + uint32_t first_tomb_idx = 0; + for (uint32_t probes = 0; probes < cap_; ++probes) { + int32_t k = slots_[idx].key; + if (k == key) { + idx_out = idx; + return true; + } + if (k == KEY_EMPTY) { + idx_out = has_first_tomb ? first_tomb_idx : idx; + return false; + } + if (k == KEY_TOMBSTONE && !has_first_tomb) { + first_tomb_idx = idx; + has_first_tomb = true; + } + ++step; + idx = (idx + step) & (cap_ - 1); // triangular probing + } + if (!has_first_tomb) { + // This should never happen if load factor is correctly maintained. 
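+      // With MAX_LOAD_PCT = 75 at least one slot is always KEY_EMPTY, and
+      // triangular probing over a power-of-2 capacity visits every slot, so
+      // this branch is unreachable while the load-factor invariant holds.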
+ throw std::runtime_error("Int32Map is full"); + } + idx_out = first_tomb_idx; + return false; + } + + template + static void place_new_(Slot* arr, uint32_t cap, int32_t key, U&& val) { + uint32_t idx = mix_hash_(key) & (cap - 1); + uint32_t step = 0; + for (uint32_t probes = 0; probes < cap; ++probes) { + int32_t k = arr[idx].key; + if (k == KEY_EMPTY || k == KEY_TOMBSTONE) { + arr[idx].key = key; + ::new (static_cast(&arr[idx].storage)) T(std::forward(val)); + return; + } + ++step; + idx = (idx + step) & (cap - 1); + } + assert(false && "rehash placement failed"); + } + + void rehash_(uint32_t new_cap) { + assert((new_cap & (new_cap - 1)) == 0 && new_cap >= MIN_CAPACITY); + Slot* fresh = new Slot[new_cap]; // keys default to KEY_EMPTY + + if (slots_) { + for (uint32_t i = 0; i < cap_; ++i) { + auto& s = slots_[i]; + if (!is_filled_(s.key)) { + continue; + } + int32_t k = s.key; + T* v = value_ptr_(s); + place_new_(fresh, new_cap, k, std::move(*v)); + v->~T(); + s.key = KEY_EMPTY; + } + delete[] slots_; + } + + slots_ = fresh; + cap_ = new_cap; + tombstones_ = 0; // cleaned + // size_ unchanged + } +}; diff --git a/csrc/suffix_cache/suffix_tree.cc b/csrc/suffix_cache/suffix_tree.cc new file mode 100644 index 000000000000..cf6e7d49c43a --- /dev/null +++ b/csrc/suffix_cache/suffix_tree.cc @@ -0,0 +1,544 @@ +// Copyright 2025 Snowflake Inc. +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include "suffix_tree.h" + +#define CHECK_OR_RETURN(cond, msg) \ + if (!(cond)) return msg; + +SuffixTree::SuffixTree(int max_depth) + : _max_depth(max_depth), _root(new Node()) {} + +// Append a new element to a new or existing sequence. +void SuffixTree::append(int seq_id, int token) { + // Initialize the sequence if it doesn't exist. + _seqs.try_emplace(seq_id); + _active_nodes.try_emplace(seq_id); + + // Insert a new active node at the root. + _active_nodes[seq_id].push_back(_root.get()); + _root->endpoints[seq_id] = static_cast(_seqs[seq_id].size()); + _root->count += 1; + + // Ensure the number of active nodes doesn't exceed max_depth. + if (_active_nodes[seq_id].size() > static_cast(_max_depth)) { + _active_nodes[seq_id].pop_front(); + } + _seqs[seq_id].push_back(token); + + // Iterate over all active nodes for this sequence. + for (size_t i = 0; i < _active_nodes[seq_id].size(); ++i) { + Node* node = _active_nodes[seq_id][i]; + Node* child = nullptr; + if (node->children.contains(token)) { + child = node->children[token].get(); + } + + assert(node->endpoints.contains(seq_id)); + assert(node->endpoints[seq_id] == _seqs[seq_id].size() - 1); + + if (child == nullptr) { + // No existing child node for the new token. + if (node->count == 1 && node != _root.get()) { + // The active node has count = 1, which means the only suffix that ends + // here is the one that's being extended right now. 
Then this node + // should be a leaf node, and we can simply extend the length of this + // node. + assert(node->children.empty()); + assert(node->ref_seq == seq_id); + node->length += 1; + node->endpoints[seq_id] += 1; + } else { + // Either this is the root node, or the current suffix is not the only + // one that ends here. Either case, we need to extend the current suffix + // into a new child. + Node* new_child = new Node(); + new_child->token = token; + new_child->parent = node; + new_child->count = 1; + new_child->endpoints[seq_id] = static_cast(_seqs[seq_id].size()); + new_child->ref_seq = seq_id; + new_child->ref_idx = static_cast(_seqs[seq_id].size()) - 1; + new_child->length = 1; + node->children.emplace(token, new_child); + node->endpoints.erase(seq_id); + _active_nodes[seq_id][i] = new_child; + } + } else if (node->count == child->count + 1 && node != _root.get()) { + // The active node has a child for the new token, and the child's count is + // exactly one fewer than the active node's count. Since the suffix for + // the active node ends here, that means all other suffixes that pass + // through this node must go to that child. + assert(node->children.size() == + 1); // The active node should have only one child. + assert(node->endpoints.size() == + 1); // Only the current suffix should end here. + if (child->length == 1) { + // The child only has length 1. If we append the new token to the + // current suffix, then it will perfectly overlap with the child. In + // this case, we should just fuse the current suffix into the child and + // eliminate the current node. + Node* parent = node->parent; + // Update child to take the place of the current node. + child->token = node->token; + child->count += 1; // Current suffix extends into the child + child->length = node->length + 1; + child->endpoints[seq_id] = static_cast(_seqs[seq_id].size()); + child->ref_seq = seq_id; + child->ref_idx = static_cast(_seqs[seq_id].size()) - child->length; + child->parent = parent; + // Give ownership of child pointer to parent and should also free the + // current node. + assert(parent->children.contains(child->token)); + assert(parent->children[child->token].get() == node); + Node* tmp = node->children[token].release(); + parent->children[child->token].reset(tmp); + // Replace active node with child node. + _active_nodes[seq_id][i] = child; + } else { + // The child has length > 1. If we append the new token to the current + // suffix, then it still does not reach the child node. In this case, we + // keep both nodes but extend the length of the current node by 1 into + // the child node. + node->length += 1; + node->endpoints[seq_id] += 1; + node->ref_seq = seq_id; + node->ref_idx = static_cast(_seqs[seq_id].size()) - node->length; + child->length -= 1; + child->ref_idx += 1; + // The child node's first token should be updated to its second token. + child->token = _seqs[child->ref_seq][child->ref_idx]; + if (child->token != token) { + Node* tmp = node->children[token].release(); + node->children.emplace(child->token, tmp); + node->children.erase(token); + } + } + } else { + // There is a child for the new token, and should move the active node + // into that child. + if (child->length == 1) { + // The child node has length 1, just update the active node pointer to + // it. 
+ node->endpoints.erase(seq_id); + child->count += 1; + child->endpoints[seq_id] = static_cast(_seqs[seq_id].size()); + child->ref_seq = seq_id; + child->ref_idx = static_cast(_seqs[seq_id].size()) - 1; + _active_nodes[seq_id][i] = child; + } else { + // The child node has length > 1. If we extend the current suffix into + // it, then it must be split into a segment of length 1 and another + // segment with the remainder. + Node* new_node = new Node(); + new_node->token = token; + new_node->count = child->count + 1; + new_node->parent = node; + new_node->length = 1; + new_node->endpoints[seq_id] = static_cast(_seqs[seq_id].size()); + new_node->ref_seq = seq_id; + new_node->ref_idx = + static_cast(_seqs[seq_id].size()) - new_node->length; + // The child node's first token should be updated to its second token. + child->token = _seqs[child->ref_seq][child->ref_idx + 1]; + Node* tmp = node->children[token].release(); + new_node->children.emplace(child->token, tmp); + node->children[token].reset(new_node); + node->endpoints.erase(seq_id); + child->parent = new_node; + child->length -= 1; + child->ref_idx += 1; + _active_nodes[seq_id][i] = new_node; + } + } + } +} + +// Extend a new or existing sequence. +void SuffixTree::extend(int seq_id, const std::vector& tokens) { + for (int token : tokens) { + append(seq_id, token); + } +} + +// Remove an existing sequence. +void SuffixTree::remove(int seq_id) { + const std::vector& seq = _seqs[seq_id]; + std::vector path; // Declare here to avoid repeated allocations. + // Loop through all suffix starting indices. + for (int start = 0; start < seq.size(); start++) { + Node* node = _root.get(); + node->count--; + int idx = start; + path.clear(); + // Loop through the nodes for this suffix. + while (idx < seq.size()) { + int token = seq[idx]; + if (!node->children.contains(token)) { + break; + } + Node* child = node->children[token].get(); + assert(child->count > 0); + child->count--; + if (child->count == 0) { + node->children.erase(token); + break; + } + if (child->endpoints.contains(seq_id)) { + child->endpoints.erase(seq_id); + } + idx += child->length; + node = child; + path.push_back(node); + } + // The last visited node may be mergeable with its child. + if (node != _root.get() && node->children.size() == 1) { + const auto& it = *node->children.begin(); + std::unique_ptr& child_uptr = node->children[it.first]; + if (node->count == child_uptr->count) { + // Merge node into child. + child_uptr->token = node->token; + child_uptr->length += node->length; + child_uptr->ref_idx -= node->length; + child_uptr->parent = node->parent; + path.back() = node = child_uptr.release(); + node->parent->children[node->token].reset(node); + } + } + // ref_seq and ref_idx of all nodes in the path may need to be updated. + // 1. Go to an arbitrary leaf to get its endpoints. + Node* leaf = node; + int distance = 0; // Distance from node to leaf. + while (!leaf->children.empty()) { + leaf = (*leaf->children.begin()).second.get(); + distance += leaf->length; + } + // 2. Pick an arbitrary endpoint for the reference sequence and index. + if (leaf->endpoints.empty() || leaf->endpoints.contains(seq_id)) { + // Still need to visit this leaf later when removing this sequence. + // We can skip updating the refs until the next time it's visited. + continue; + } + const auto& ref = *leaf->endpoints.begin(); + // 3. Go back up the path to update all nodes' refs. 
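+    //    ref.second is the index one past the suffix's last token in ref_seq;
+    //    subtracting `distance` gives the end of the current node's span, and
+    //    each further subtraction of n->length yields that node's start index.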
+ int32_t ref_seq = ref.first; + int32_t ref_idx = ref.second - distance; + while (!path.empty()) { + Node* n = path.back(); + path.pop_back(); + ref_idx -= n->length; + if (n->ref_seq == seq_id) { + n->ref_seq = ref_seq; + n->ref_idx = ref_idx; + } + } + } + _seqs.erase(seq_id); + _active_nodes.erase(seq_id); +} + +Candidate SuffixTree::speculate(const std::vector& pattern, + int max_spec_tokens, float max_spec_factor, + float max_spec_offset, float min_token_prob, + bool use_tree_spec) { + Candidate result; + int start_idx = std::max(static_cast(pattern.size()) - _max_depth, 0); + for (; start_idx < pattern.size(); start_idx++) { + auto [node, idx] = _match_pattern(pattern, start_idx); + if (node == nullptr) { + continue; + } + int match_len = static_cast(pattern.size()) - start_idx; + int max_tokens = std::min( + max_spec_tokens, + static_cast(match_len * max_spec_factor + max_spec_offset + 1e-6)); + max_tokens = std::max(max_tokens, 0); + Candidate candidate; + if (use_tree_spec) { + candidate = _speculate_tree(node, idx, max_tokens, min_token_prob); + } else { + candidate = _speculate_path(node, idx, max_tokens, min_token_prob); + } + if (candidate.score > result.score) { + result = std::move(candidate); + result.match_len = match_len; + } + } + return result; +} + +std::string SuffixTree::check_integrity() { + // 1. Check structural integrity of all nodes. + std::queue queue; + queue.push(_root.get()); + while (!queue.empty()) { + Node* node = queue.front(); + queue.pop(); + std::string ret = _check_node_integrity(node); + if (!ret.empty()) { + return ret; + } + for (const auto& [token, child] : node->children) { + queue.push(child.get()); + } + } + // 2. Check all sequences are represented in the tree. + std::unordered_map visit_count; + for (int seq_id = 0; seq_id < _seqs.size(); seq_id++) { + const std::vector& seq = _seqs[seq_id]; + // Loop through all suffix starting indices. + for (int start = 0; start < seq.size(); start++) { + int idx = start; + // Traverse the tree along this suffix. + Node* node = _root.get(); + visit_count[node]++; + while (idx < seq.size() && idx - start < _max_depth) { + CHECK_OR_RETURN(node->children.contains(seq[idx]), + "missing child node for sequence"); + node = node->children[seq[idx]].get(); + visit_count[node]++; + CHECK_OR_RETURN(idx + node->length <= seq.size(), + "path exceeds sequence length"); + for (int i = 0; i < node->length; ++i) { + int ref_seq = node->ref_seq; + int ref_idx = node->ref_idx + i; + CHECK_OR_RETURN(seq[idx + i] == _seqs[ref_seq][ref_idx], + "path does not match sequence tokens"); + } + idx += node->length; + } + // The last node on this path should have an endpoint. + CHECK_OR_RETURN(node->endpoints.contains(seq_id), + "missing endpoint for sequence"); + } + } + // 3. Check all nodes were visited the correct number of times. + assert(queue.empty()); + queue.push(_root.get()); + while (!queue.empty()) { + Node* node = queue.front(); + queue.pop(); + CHECK_OR_RETURN(node->count == visit_count[node], + "node count does not match visit count"); + for (const auto& [token, child] : node->children) { + queue.push(child.get()); + } + } + return ""; +} + +std::string SuffixTree::_check_node_integrity(Node* node) { + int64_t children_count = 0; + for (const auto& [token, child] : node->children) { + // Do all my children have me as their parent? + CHECK_OR_RETURN(child->parent == node, + "child node has incorrect parent pointer"); + children_count++; + } + // Is my counter at least the sum of my childrens' counters? 
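+  // (children_count tallies the number of children; since every child must
+  // have count >= 1, this is a necessary, albeit weaker, condition.)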
+ CHECK_OR_RETURN(children_count <= node->count, + "node count is less than sum children counts"); + if (node == _root.get()) { + // Root node can stop here after some simple checks. + CHECK_OR_RETURN(node->count >= 0, "root node has negative count"); + CHECK_OR_RETURN(node->parent == nullptr, + "root node has non-null parent pointer"); + CHECK_OR_RETURN(node->length == 0, "root node has non-zero length"); + CHECK_OR_RETURN(node->endpoints.empty(), + "root node has non-empty endpoints"); + CHECK_OR_RETURN(node->ref_idx == -1, "root node has invalid ref_idx"); + return ""; + } + // Is my length positive? Otherwise, I shouldn't exist. + CHECK_OR_RETURN(node->length > 0, "internal node has non-positive length"); + // Is my count positive? Otherwise, I shouldn't exist. + CHECK_OR_RETURN(node->count > 0, "internal node has non-positive count"); + // Are all my children's counts less than mine? If equal, then we should have + // been merged. + for (const auto& [token, child] : node->children) { + CHECK_OR_RETURN(child->count < node->count, + "internal node count is not greater than child count"); + } + // Check my reference sequence and index. + CHECK_OR_RETURN(_seqs.count(node->ref_seq), + "internal node has invalid ref_seq"); + CHECK_OR_RETURN(node->ref_idx >= 0, "internal node has invalid ref_idx"); + CHECK_OR_RETURN(node->ref_idx + node->length <= _seqs[node->ref_seq].size(), + "internal node has invalid token range"); + // Check my first token is correct. + CHECK_OR_RETURN(node->token == _seqs[node->ref_seq][node->ref_idx], + "internal node has incorrect first token"); + // Check I am my parent's child. + CHECK_OR_RETURN(node->parent->children.contains(node->token), + "internal node is not a child of parent node"); + CHECK_OR_RETURN(node->parent->children[node->token].get() == node, + "parent node has incorrect child pointer"); + // Check all my endpoint references are correct. + for (auto [seq_id, end_idx] : node->endpoints) { + CHECK_OR_RETURN(_seqs.count(seq_id), + "node endpoint refers to nonexistent sequence"); + CHECK_OR_RETURN(end_idx > 0 && end_idx <= _seqs[seq_id].size(), + "invalid endpoint index"); + // Check all tokens from the start of the suffix to the endpoint. + Node* n = node; + int idx = end_idx; + do { + CHECK_OR_RETURN(n->length <= idx, "invalid endpoint length"); + idx -= n->length; + for (int i = 0; i < n->length; ++i) { + int tok = _seqs[n->ref_seq][n->ref_idx + i]; + CHECK_OR_RETURN(_seqs[seq_id][idx + i] == tok, + "invalid endpoint token"); + } + n = n->parent; + } while (n != nullptr); + } + return ""; +} + +std::pair SuffixTree::_match_pattern( + const std::vector& pattern, int start_idx) { + Node* node = _root.get(); + int idx = 0; + for (int i = start_idx; i < pattern.size(); i++) { + int c = pattern[i]; + if (idx >= node->length) { + if (!node->children.contains(c)) { + return {nullptr, -1}; + } + node = node->children[c].get(); + idx = 0; + } + assert(idx < node->length); + if (_seqs[node->ref_seq][node->ref_idx + idx] != c) { + return {nullptr, -1}; + } + idx++; + } + return {node, idx}; +} + +Candidate SuffixTree::_speculate_path(Node* node, int idx, int max_spec_tokens, + float min_token_prob) { + Candidate ret; + float prob = 1.0f; + while (ret.token_ids.size() < max_spec_tokens && prob >= min_token_prob) { + if (idx < node->length) { + // Use previous token index as parent; if none, mark as -1. 
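+      // _speculate_path builds a single chain, so each token's parent is the
+      // previously emitted token (index size() - 1), or -1 for the first token.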
+ ret.parents.push_back(static_cast(ret.token_ids.size()) - 1); + int token = _seqs[node->ref_seq][node->ref_idx + idx]; + ret.token_ids.push_back(token); + ret.probs.push_back(prob); + ret.score += prob; + idx++; + } else { + Node* child = nullptr; + int64_t count = 0; + // Choose the child with the maximum count. + for (const auto& kv : node->children) { + Node* ch = kv.second.get(); + if (ch->count > count) { + child = ch; + count = ch->count; + } + } + if (child == nullptr) { + break; + } + prob *= static_cast(count) / node->count; + node = child; + idx = 0; + } + } + return ret; +} + +struct HeapItem { + float prob; + Node* node; + int idx; + int parent; // index in the candidate token list; -1 if none. + + HeapItem(float p, Node* n, int i, int par) + : prob(p), node(n), idx(i), parent(par) {} +}; + +struct HeapItemCompare { + bool operator()(const HeapItem& a, const HeapItem& b) const { + // In C++ priority_queue by default returns the largest element. + // Thus, we compare probabilities so that the highest prob is returned. + return a.prob < b.prob; + } +}; + +// Get a candidate token tree using a priority queue. +Candidate SuffixTree::_speculate_tree(Node* node, int idx, int max_spec_tokens, + float min_token_prob) { + Candidate ret; + std::priority_queue, HeapItemCompare> queue; + queue.emplace(1.0, node, idx, -1); + while (ret.token_ids.size() < max_spec_tokens && !queue.empty()) { + HeapItem item = queue.top(); + queue.pop(); + if (item.idx < item.node->length) { + int token = _seqs[item.node->ref_seq][item.node->ref_idx + item.idx]; + ret.token_ids.push_back(token); + ret.parents.push_back(item.parent); + ret.probs.push_back(item.prob); + ret.score += item.prob; + queue.emplace(item.prob, item.node, item.idx + 1, + static_cast(ret.token_ids.size()) - 1); + } else { + for (const auto& kv : item.node->children) { + Node* child = kv.second.get(); + float prob = + item.prob * child->count / static_cast(item.node->count); + if (prob >= min_token_prob) { + queue.emplace(prob, child, 0, item.parent); + } + } + } + } + return ret; +} + +size_t SuffixTree::estimate_memory() const { + size_t total = sizeof(*this); + std::vector stack; + stack.push_back(_root.get()); + while (!stack.empty()) { + Node* node = stack.back(); + stack.pop_back(); + total += node->memory_usage(); + for (const auto& [token, child] : node->children) { + stack.push_back(child.get()); + } + } + for (const auto& [seq_id, seq] : _seqs) { + total += sizeof(decltype(seq)::value_type) * seq.capacity(); + } + for (const auto& [seq_id, active_nodes] : _active_nodes) { + total += sizeof(decltype(active_nodes)::value_type) * active_nodes.size(); + } + return total; +} diff --git a/csrc/suffix_cache/suffix_tree.h b/csrc/suffix_cache/suffix_tree.h new file mode 100644 index 000000000000..a7b304a668be --- /dev/null +++ b/csrc/suffix_cache/suffix_tree.h @@ -0,0 +1,132 @@ +// Copyright 2025 Snowflake Inc. +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "int32_map.h" + +struct Node { + // Token referenced by this node. Node can refer to a sequence of tokens, + // this is just the ID of the first token. + int token = 0; + + // Number of suffixes from the root that end at or pass through this node. + int64_t count = 0; + + // Parent node. + Node* parent = nullptr; + + // Children nodes, the key should always be the first token of the child. + Int32Map> children; + + // Maps sequence ID -> index of the end of the suffix in that sequence. + Int32Map endpoints; + + // Reference sequence ID and starting index for the tokens in this node. + int ref_seq = 0; + int ref_idx = -1; + + // Number of tokens in this node. + int length = 0; + + // Memory usage of this node. + size_t memory_usage() const { + size_t total = sizeof(*this); + total += children.memory_usage(); + total += endpoints.memory_usage(); + return total; + } +}; + +struct Candidate { + // The token ids of the speculation candidate. + std::vector token_ids; + + // For each token, the index of its parent token (-1 if no parent). + std::vector parents; + + // For each token, the estimated probability of the token. + std::vector probs; + + // Floating point score of the candidate (sum of all probs). + float score = 0.0; + + // Length of the prefix match for the speculated tokens. + int match_len = 0; +}; + +class SuffixTree { + public: + SuffixTree(int max_depth); + + int num_seqs() const { return static_cast(_seqs.size()); } + + // Append a new element to the sequence with id seq_id. + void append(int seq_id, int token); + + // Append multiple new elements to the sequence with id seq_id. + void extend(int seq_id, const std::vector& tokens); + + // Remove the sequence with id seq_id. + void remove(int seq_id); + + // Given a pattern, speculate the next tokens using the suffix tree. + Candidate speculate(const std::vector& pattern, int max_spec_tokens, + float max_spec_factor = 1.0f, + float max_spec_offset = 0.0f, float min_token_prob = 0.1f, + bool use_tree_spec = false); + + // Check the integrity of the suffix tree, return empty string if ok, + // otherwise return an error message. + std::string check_integrity(); + + // Estimate memory usage of the suffix tree, for debugging only. It + // walks the entire tree so can be slow. + size_t estimate_memory() const; + + private: + // Maximum depth of the suffix tree. + int _max_depth; + + // The root node of the suffix tree. + std::unique_ptr _root; + + // Mapping from seq id to its sequence (vector of ints). + std::unordered_map> _seqs; + + // For each sequence, a sliding window of active nodes. Maintains at most + // _max_depth active nodes for each sequence. Queue is shifted when a new + // token is added to the sequence. Each active node is in the queue for at + // most _max_depth iterations before being removed. + std::unordered_map> _active_nodes; + + std::pair _match_pattern(const std::vector& pattern, + int start_idx = 0); + + Candidate _speculate_path(Node* node, int idx, int max_spec_tokens, + float min_token_prob); + + Candidate _speculate_tree(Node* node, int idx, int max_spec_tokens, + float min_token_prob); + + std::string _check_node_integrity(Node* node); +}; diff --git a/csrc/suffix_cache/torch_bindings.cpp b/csrc/suffix_cache/torch_bindings.cpp new file mode 100644 index 000000000000..efb8fda068ba --- /dev/null +++ b/csrc/suffix_cache/torch_bindings.cpp @@ -0,0 +1,262 @@ +// Copyright 2025 Snowflake Inc. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "suffix_tree.h" +#include "core/registration.h" + +// Register custom types with PyTorch +namespace { +c10::intrusive_ptr make_candidate( + const std::vector& token_ids, const std::vector& parents, + const std::vector& probs, double score, int64_t match_len) { + // Create a ClassType with named attributes + auto classType = c10::ClassType::create( + "_suffix_cache.Candidate", std::weak_ptr()); + + // Add attributes to the class type + classType->addAttribute("token_ids", + c10::ListType::create(c10::IntType::get())); + classType->addAttribute("parents", + c10::ListType::create(c10::IntType::get())); + classType->addAttribute("probs", + c10::ListType::create(c10::FloatType::get())); + classType->addAttribute("score", c10::FloatType::get()); + classType->addAttribute("match_len", c10::IntType::get()); + + // Create the object with 5 slots for attributes + auto obj = + c10::ivalue::Object::create(c10::StrongTypePtr(nullptr, classType), 5); + + // Set attributes by name + obj->setAttr("token_ids", c10::List(token_ids)); + obj->setAttr("parents", c10::List(parents)); + obj->setAttr("probs", c10::List(probs)); + obj->setAttr("score", score); + obj->setAttr("match_len", match_len); + + return obj; +} + +// Wrapper functions for SuffixTree operations +class SuffixTreeWrapper { + std::unique_ptr tree_; + + public: + explicit SuffixTreeWrapper(int64_t max_depth) + : tree_(std::make_unique(static_cast(max_depth))) {} + + int64_t num_seqs() const { return static_cast(tree_->num_seqs()); } + + void append(int64_t seq_id, int64_t token) { + tree_->append(static_cast(seq_id), static_cast(token)); + } + + void extend(int64_t seq_id, const std::vector& tokens) { + std::vector int_tokens; + int_tokens.reserve(tokens.size()); + for (int64_t token : tokens) { + int_tokens.push_back(static_cast(token)); + } + tree_->extend(static_cast(seq_id), int_tokens); + } + + void remove(int64_t seq_id) { tree_->remove(static_cast(seq_id)); } + + c10::intrusive_ptr speculate( + const std::vector& pattern, int64_t max_spec_tokens, + double max_spec_factor, double max_spec_offset, double min_token_prob, + bool use_tree_spec) { + std::vector int_pattern; + int_pattern.reserve(pattern.size()); + for (int64_t token : pattern) { + int_pattern.push_back(static_cast(token)); + } + + Candidate result = + tree_->speculate(int_pattern, static_cast(max_spec_tokens), + static_cast(max_spec_factor), + static_cast(max_spec_offset), + static_cast(min_token_prob), use_tree_spec); + + // Convert Candidate to PyTorch custom type + std::vector token_ids(result.token_ids.begin(), + result.token_ids.end()); + std::vector parents(result.parents.begin(), result.parents.end()); + std::vector probs(result.probs.begin(), result.probs.end()); + + return make_candidate(token_ids, parents, probs, + static_cast(result.score), + static_cast(result.match_len)); + } + + std::string check_integrity() { return 
tree_->check_integrity(); } + + int64_t estimate_memory() const { + return static_cast(tree_->estimate_memory()); + } +}; + +// Shim functions for TORCH_LIBRARY registration +torch::Tensor suffix_tree_create(int64_t max_depth) { + auto wrapper = std::make_unique(max_depth); + void* ptr = wrapper.release(); + + // Store the pointer in a tensor (this is a common pattern in vLLM) + // We use a CPU int64 tensor to store the pointer + auto options = + torch::TensorOptions().dtype(torch::kInt64).device(torch::kCPU); + auto tensor = torch::empty({1}, options); + tensor.data_ptr()[0] = reinterpret_cast(ptr); + return tensor; +} + +void suffix_tree_destroy(torch::Tensor handle) { + int64_t ptr_value = handle.data_ptr()[0]; + auto* wrapper = reinterpret_cast(ptr_value); + delete wrapper; +} + +int64_t suffix_tree_num_seqs(torch::Tensor handle) { + int64_t ptr_value = handle.data_ptr()[0]; + auto* wrapper = reinterpret_cast(ptr_value); + return wrapper->num_seqs(); +} + +void suffix_tree_append(torch::Tensor handle, int64_t seq_id, int64_t token) { + int64_t ptr_value = handle.data_ptr()[0]; + auto* wrapper = reinterpret_cast(ptr_value); + wrapper->append(seq_id, token); +} + +void suffix_tree_extend(torch::Tensor handle, int64_t seq_id, + torch::Tensor tokens) { + int64_t ptr_value = handle.data_ptr()[0]; + auto* wrapper = reinterpret_cast(ptr_value); + + auto tokens_accessor = tokens.accessor(); + std::vector token_vec; + token_vec.reserve(tokens_accessor.size(0)); + for (int64_t i = 0; i < tokens_accessor.size(0); ++i) { + token_vec.push_back(tokens_accessor[i]); + } + + wrapper->extend(seq_id, token_vec); +} + +void suffix_tree_remove(torch::Tensor handle, int64_t seq_id) { + int64_t ptr_value = handle.data_ptr()[0]; + auto* wrapper = reinterpret_cast(ptr_value); + wrapper->remove(seq_id); +} + +std::tuple +suffix_tree_speculate(torch::Tensor handle, torch::Tensor pattern, + int64_t max_spec_tokens, double max_spec_factor, + double max_spec_offset, double min_token_prob, + bool use_tree_spec) { + int64_t ptr_value = handle.data_ptr()[0]; + auto* wrapper = reinterpret_cast(ptr_value); + + auto pattern_accessor = pattern.accessor(); + std::vector pattern_vec; + pattern_vec.reserve(pattern_accessor.size(0)); + for (int64_t i = 0; i < pattern_accessor.size(0); ++i) { + pattern_vec.push_back(pattern_accessor[i]); + } + + auto result = + wrapper->speculate(pattern_vec, max_spec_tokens, max_spec_factor, + max_spec_offset, min_token_prob, use_tree_spec); + + // Extract attributes from the custom object using string names + auto token_ids_list = result->getAttr("token_ids").toIntList(); + auto parents_list = result->getAttr("parents").toIntList(); + auto probs_list = result->getAttr("probs").toDoubleList(); + double score = result->getAttr("score").toDouble(); + int64_t match_len = result->getAttr("match_len").toInt(); + + // Convert c10::List to std::vector for tensor creation + std::vector token_ids_vec(token_ids_list.begin(), + token_ids_list.end()); + std::vector parents_vec(parents_list.begin(), parents_list.end()); + std::vector probs_vec(probs_list.begin(), probs_list.end()); + + // Convert to tensors + auto options = + torch::TensorOptions().dtype(torch::kInt64).device(torch::kCPU); + auto float_options = + torch::TensorOptions().dtype(torch::kFloat64).device(torch::kCPU); + + auto token_ids_tensor = torch::tensor(token_ids_vec, options); + auto parents_tensor = torch::tensor(parents_vec, options); + auto probs_tensor = torch::tensor(probs_vec, float_options); + + return 
std::make_tuple(token_ids_tensor, parents_tensor, probs_tensor, score, + match_len); +} + +std::string suffix_tree_check_integrity(torch::Tensor handle) { + int64_t ptr_value = handle.data_ptr()[0]; + auto* wrapper = reinterpret_cast(ptr_value); + return wrapper->check_integrity(); +} + +int64_t suffix_tree_estimate_memory(torch::Tensor handle) { + int64_t ptr_value = handle.data_ptr()[0]; + auto* wrapper = reinterpret_cast(ptr_value); + return wrapper->estimate_memory(); +} + +} // anonymous namespace + +TORCH_LIBRARY(_suffix_cache_C, m) { + // SuffixTree operations + m.def("suffix_tree_create(int max_depth) -> Tensor"); + m.def("suffix_tree_destroy(Tensor handle) -> ()"); + m.def("suffix_tree_num_seqs(Tensor handle) -> int"); + m.def("suffix_tree_append(Tensor handle, int seq_id, int token) -> ()"); + m.def("suffix_tree_extend(Tensor handle, int seq_id, Tensor tokens) -> ()"); + m.def("suffix_tree_remove(Tensor handle, int seq_id) -> ()"); + m.def( + "suffix_tree_speculate(Tensor handle, Tensor pattern, int " + "max_spec_tokens, float max_spec_factor, float max_spec_offset, float " + "min_token_prob, bool use_tree_spec) -> (Tensor, Tensor, Tensor, float, " + "int)"); + m.def("suffix_tree_check_integrity(Tensor handle) -> str"); + m.def("suffix_tree_estimate_memory(Tensor handle) -> int"); +} + +// For functions without tensor arguments, use CompositeExplicitAutograd +TORCH_LIBRARY_IMPL(_suffix_cache_C, CompositeExplicitAutograd, m) { + m.impl("suffix_tree_create", &suffix_tree_create); +} + +TORCH_LIBRARY_IMPL(_suffix_cache_C, CPU, m) { + m.impl("suffix_tree_destroy", &suffix_tree_destroy); + m.impl("suffix_tree_num_seqs", &suffix_tree_num_seqs); + m.impl("suffix_tree_append", &suffix_tree_append); + m.impl("suffix_tree_extend", &suffix_tree_extend); + m.impl("suffix_tree_remove", &suffix_tree_remove); + m.impl("suffix_tree_speculate", &suffix_tree_speculate); + m.impl("suffix_tree_check_integrity", &suffix_tree_check_integrity); + m.impl("suffix_tree_estimate_memory", &suffix_tree_estimate_memory); +} + +REGISTER_EXTENSION(_suffix_cache_C) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index af65b6d38e02..60a882a1bae3 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -54,7 +54,7 @@ def parse_args(): "--method", type=str, default="eagle", - choices=["ngram", "eagle", "eagle3", "mtp"], + choices=["ngram", "eagle", "eagle3", "mtp", "suffix"], ) parser.add_argument("--num-spec-tokens", type=int, default=2) parser.add_argument("--prompt-lookup-max", type=int, default=5) @@ -124,6 +124,11 @@ def main(args): "method": "mtp", "num_speculative_tokens": args.num_spec_tokens, } + elif args.method == "suffix": + speculative_config = { + "method": "suffix", + "num_speculative_tokens": args.num_spec_tokens, + } else: raise ValueError(f"unknown method: {args.method}") diff --git a/setup.py b/setup.py index a8fec8a028d0..afab43a0f5e0 100644 --- a/setup.py +++ b/setup.py @@ -593,6 +593,7 @@ def _read_requirements(filename: str) -> list[str]: if _build_custom_ops(): ext_modules.append(CMakeExtension(name="vllm._C")) + ext_modules.append(CMakeExtension(name="vllm._suffix_cache_C")) package_data = { "vllm": [ diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 8f048775352e..11653c00b421 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -8,7 +8,7 @@ import pytest import torch -from tests.utils import 
get_attn_backend_list_based_on_platform, large_gpu_mark +from tests.utils import get_attn_backend_list_based_on_platform from vllm import LLM, SamplingParams from vllm.assets.base import VLLM_S3_BUCKET_URL from vllm.assets.image import VLM_IMAGES_DIR @@ -125,34 +125,221 @@ def test_ngram_correctness( cleanup_dist_env_and_memory() +def test_suffix_correctness( + monkeypatch: pytest.MonkeyPatch, + sampling_config: SamplingParams, + model_name: str, +): + ''' + Compare the outputs of an original LLM and a speculative LLM + should be the same when using suffix speculative decoding. + ''' + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + # Create test prompts with repetitive patterns that suffix decode + # can leverage + test_prompts = [] + + # Add prompts with repetitive patterns + repetitive_prompts = [ + # Code-like patterns + [{ + "role": + "user", + "content": + "Write a Python function that prints numbers 1 to 10, " + "each on a new line using a for loop." + }], + [{ + "role": + "user", + "content": + "Create a list of dictionaries where each dictionary " + "has 'id' and 'name' keys for 5 users." + }], + # Repetitive text patterns + [{ + "role": + "user", + "content": + "List the days of the week, each followed by a colon " + "and the word 'workday' or 'weekend'." + }], + [{ + "role": + "user", + "content": + "Generate a multiplication table for 5 " + "(5x1=5, 5x2=10, etc.) up to 5x5." + }], + # Template-like patterns + [{ + "role": + "user", + "content": + "Create 3 email signatures, each with Name, Title, " + "Company, and Email format." + }], + ] + + # Add some of the original test prompts for variety + original_prompts = get_test_prompts(mm_enabled=False)[:20] + + test_prompts = repetitive_prompts + original_prompts + + ref_llm = LLM(model=model_name, + max_model_len=1024, + gpu_memory_utilization=0.25) + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm + torch.cuda.empty_cache() + cleanup_dist_env_and_memory() + + spec_llm = LLM( + model=model_name, + gpu_memory_utilization=0.25, + speculative_config={ + "method": "suffix", + "num_speculative_tokens": 8, + "suffix_cache_max_depth": 64, + "suffix_cache_max_requests": 1000, + "suffix_min_token_prob": 0.1, + }, + max_model_len=1024, + ) + spec_outputs = spec_llm.chat(test_prompts, sampling_config) + matches = 0 + misses = 0 + for ref_output, spec_output in zip(ref_outputs, spec_outputs): + if ref_output.outputs[0].text == spec_output.outputs[0].text: + matches += 1 + else: + misses += 1 + print(f"ref_output: {ref_output.outputs[0].text}") + print(f"spec_output: {spec_output.outputs[0].text}") + + # Suffix decode should maintain correctness + # We expect at least 66% match rate - suffix decode may have + # slightly different token boundaries but should produce + # semantically similar outputs + assert matches >= int(0.66 * len(ref_outputs)), \ + f"Suffix decode correctness too low: " \ + f"{matches}/{len(ref_outputs)} matches" + + # Also ensure we have a reasonable number of matches + assert matches >= 15, f"Too few matches: {matches}" + del spec_llm + torch.cuda.empty_cache() + cleanup_dist_env_and_memory() + + @pytest.mark.parametrize( - ["model_setup", "mm_enabled"], + "suffix_config", [ - (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False), - pytest.param(("eagle3", "Qwen/Qwen2.5-VL-7B-Instruct", - "Rayzl/qwen2.5-vl-7b-eagle3-sgl", 1), - False, - marks=pytest.mark.skip(reason="Skipping due to its " \ - "head_dim not being a a multiple of 32")), - (("eagle", 
"meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False), - (("eagle3", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False), - pytest.param(("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), - False, - marks=large_gpu_mark(min_gb=80)), # works on 4x H100 - pytest.param(("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), - True, - marks=large_gpu_mark(min_gb=80)), # works on 4x H100 - (("eagle", "eagle618/deepseek-v3-random", - "eagle618/eagle-deepseek-v3-random", 1), False), - ], - ids=[ - "qwen3_eagle3", "qwen2_5_vl_eagle3", "llama3_eagle", "llama3_eagle3", - "llama4_eagle", "llama4_eagle_mm", "deepseek_eagle" + # (num_speculative_tokens, cache_max_depth, min_token_prob) + (4, 32, 0.1), # Conservative configuration + (8, 64, 0.1), # Default configuration + (16, 128, 0.05), # Aggressive configuration ]) +def test_suffix_with_configs( + monkeypatch: pytest.MonkeyPatch, + sampling_config: SamplingParams, + model_name: str, + suffix_config: tuple[int, int, float], +): + ''' + Test suffix decode with different configurations to ensure + correctness is maintained across various parameter settings. + ''' + num_spec_tokens, max_depth, min_prob = suffix_config + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + # Use a smaller set of prompts for parametrized tests + test_prompts = [ + # Highly repetitive pattern + [{ + "role": + "user", + "content": + "Count from 1 to 20, writing each number on a new line." + }], + # Code pattern + [{ + "role": + "user", + "content": + "Write a for loop that prints 'Hello World' " + "5 times." + }], + # Mixed pattern + [{ + "role": + "user", + "content": + "List three colors and their RGB values in format: " + "Color: R=X, G=Y, B=Z" + }], + ] + + ref_llm = LLM(model=model_name, max_model_len=512) + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm + torch.cuda.empty_cache() + cleanup_dist_env_and_memory() + + spec_llm = LLM( + model=model_name, + speculative_config={ + "method": "suffix", + "num_speculative_tokens": num_spec_tokens, + "suffix_cache_max_depth": max_depth, + "suffix_cache_max_requests": 100, + "suffix_min_token_prob": min_prob, + }, + max_model_len=512, + ) + spec_outputs = spec_llm.chat(test_prompts, sampling_config) + + # Verify all outputs match exactly + for i, (ref_output, + spec_output) in enumerate(zip(ref_outputs, spec_outputs)): + assert ref_output.outputs[0].text == spec_output.outputs[0].text, \ + f"Mismatch with config {suffix_config} on prompt {i}: " \ + f"ref='{ref_output.outputs[0].text}' vs " \ + f"spec='{spec_output.outputs[0].text}'" + + del spec_llm + torch.cuda.empty_cache() + cleanup_dist_env_and_memory() + + +@pytest.mark.parametrize(["model_setup", "mm_enabled"], [ + (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False), + (("eagle", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False), + (("eagle3", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False), + pytest.param( + ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), + False, + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + pytest.param( + ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), + True, + 
marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + (("eagle", "eagle618/deepseek-v3-random", + "eagle618/eagle-deepseek-v3-random", 1), False), +], + ids=[ + "qwen3_eagle3", "llama3_eagle", "llama3_eagle3", + "llama4_eagle", "llama4_eagle_mm", + "deepseek_eagle" + ]) @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform()) def test_eagle_correctness( diff --git a/tests/v1/spec_decode/test_suffix_cache.py b/tests/v1/spec_decode/test_suffix_cache.py new file mode 100644 index 000000000000..b935e6f8b57b --- /dev/null +++ b/tests/v1/spec_decode/test_suffix_cache.py @@ -0,0 +1,183 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for suffix cache implementation.""" + +import pytest + +from vllm.v1.spec_decode.suffix_decode.suffix_cache import (SuffixCache, + SuffixSpecResult) + + +class TestSuffixCache: + """Test suite for SuffixCache functionality.""" + + def test_basic_operations(self): + """Test basic suffix cache operations.""" + cache = SuffixCache(max_tree_depth=32, max_cached_requests=10) + + # Start a request + cache.start_request("req1", [1, 2, 3, 4, 5]) + assert "req1" in cache.active_requests + + # Add response tokens + cache.add_active_response("req1", [6, 7, 8]) + + # Speculate based on pattern + result = cache.speculate("req1", [1, 2, 3, 4, 5, 6]) + assert isinstance(result, SuffixSpecResult) + assert result.token_ids == [7, 8] + assert result.score > 0 + + # Stop request + cache.stop_request("req1") + assert "req1" not in cache.active_requests + + def test_multiple_requests(self): + """Test handling multiple concurrent requests.""" + cache = SuffixCache(max_tree_depth=16, max_cached_requests=5) + + # Start multiple requests with similar patterns + cache.start_request("req1", [1, 2, 3]) + cache.start_request("req2", [1, 2, 3]) + cache.start_request("req3", [4, 5, 6]) + + # Add different continuations + cache.add_active_response("req1", [4, 5]) + cache.add_active_response("req2", [4, 6]) + cache.add_active_response("req3", [7, 8]) + + # Test speculation for each request + result1 = cache.speculate("req1", [1, 2, 3, 4]) + assert 5 in result1.token_ids + + result2 = cache.speculate("req2", [1, 2, 3, 4]) + assert 6 in result2.token_ids + + result3 = cache.speculate("req3", [4, 5, 6, 7]) + assert 8 in result3.token_ids + + # Cleanup + cache.stop_request("req1") + cache.stop_request("req2") + cache.stop_request("req3") + + def test_cache_eviction(self): + """Test cache eviction when max requests is reached.""" + cache = SuffixCache(max_tree_depth=8, max_cached_requests=3) + + # Fill cache + for i in range(4): + cache.start_request(f"req{i}", [i, i + 1]) + cache.add_active_response(f"req{i}", [i + 2]) + cache.stop_request(f"req{i}") + + # Check that we have at most max_cached_requests + assert len(cache.cached_requests) <= 3 + + def test_pattern_matching(self): + """Test pattern matching with various lengths.""" + cache = SuffixCache(max_tree_depth=32) + + # Add a long sequence + cache.start_request("req1", list(range(20))) + cache.add_active_response("req1", list(range(20, 30))) + + # Test different pattern lengths + # Short pattern + result = cache.speculate("req1", [0, 1, 2]) + assert result.match_len == 3 + + # Medium pattern + result = cache.speculate("req1", list(range(10))) + assert result.match_len == 10 + + # Pattern extending into response + result = cache.speculate("req1", list(range(25))) + assert result.match_len == 25 + assert result.token_ids # Should 
have predictions + + cache.stop_request("req1") + + def test_empty_patterns(self): + """Test handling of empty patterns and edge cases.""" + cache = SuffixCache(max_tree_depth=16) + + # Empty prompt + cache.start_request("req1", []) + result = cache.speculate("req1", []) + assert result.token_ids == [] + assert result.score == 0.0 + + # Add tokens and test + cache.add_active_response("req1", [1, 2, 3]) + result = cache.speculate("req1", [1]) + # Should predict at least the next token + assert len(result.token_ids) >= 1 + assert result.token_ids[0] == 2 + + cache.stop_request("req1") + + def test_invalid_operations(self): + """Test error handling for invalid operations.""" + cache = SuffixCache(max_tree_depth=16) + + # Speculate on non-existent request + with pytest.raises(ValueError, match="not active"): + cache.speculate("nonexistent", [1, 2, 3]) + + # Stop non-existent request + with pytest.raises(ValueError, match="not active"): + cache.stop_request("nonexistent") + + # Start duplicate request + cache.start_request("req1", [1, 2, 3]) + with pytest.raises(ValueError, match="already active"): + cache.start_request("req1", [4, 5, 6]) + + cache.stop_request("req1") + + def test_max_depth_handling(self): + """Test that patterns longer than max_depth are handled correctly.""" + cache = SuffixCache(max_tree_depth=8) + + # Add sequence longer than max_depth + long_prompt = list(range(20)) + cache.start_request("req1", long_prompt) + cache.add_active_response("req1", [100, 101, 102]) + + # Pattern longer than max_depth should be truncated + long_pattern = list(range(15)) + [100] + result = cache.speculate("req1", long_pattern) + + # Should still find matches based on truncated pattern + assert result.token_ids + assert result.match_len <= 8 # Limited by max_depth + + cache.stop_request("req1") + + def test_speculation_parameters(self): + """Test different speculation parameters.""" + cache = SuffixCache(max_tree_depth=32) + + cache.start_request("req1", [1, 2, 3]) + cache.add_active_response("req1", list(range(4, 20))) + + # Test with different max_spec_tokens + result1 = cache.speculate("req1", [1, 2, 3, 4], max_spec_tokens=2) + assert len(result1.token_ids) <= 2 + + result2 = cache.speculate("req1", [1, 2, 3, 4], max_spec_tokens=10) + assert len(result2.token_ids) <= 10 + + # Test with different min_token_prob + result3 = cache.speculate("req1", [1, 2, 3, 4], + max_spec_tokens=5, + min_token_prob=0.9) + # With high probability threshold, might get fewer tokens + assert len(result3.token_ids) <= 5 + + cache.stop_request("req1") + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/v1/spec_decode/test_suffix_tree_cpp.py b/tests/v1/spec_decode/test_suffix_tree_cpp.py new file mode 100644 index 000000000000..5eb5d59c13e5 --- /dev/null +++ b/tests/v1/spec_decode/test_suffix_tree_cpp.py @@ -0,0 +1,184 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for C++ suffix tree implementation.""" + +import pytest + +from vllm.v1.spec_decode.suffix_decode import Candidate, SuffixTree + + +class TestSuffixTreeCpp: + """Test suite for C++ SuffixTree implementation.""" + + def test_basic_operations(self): + """Test basic suffix tree operations.""" + tree = SuffixTree(32) # max_depth = 32 + + # Add sequences + tree.extend(0, [1, 2, 3, 4, 5]) + tree.extend(1, [1, 2, 3, 6, 7]) + assert tree.num_seqs() == 2 + + # Test speculation + result = tree.speculate([1, 2, 3], 5, 1.0, 0.0, 0.1, False) + assert 
isinstance(result, Candidate) + assert len(result.token_ids) > 0 + assert result.score > 0 + assert result.match_len == 3 + + # Remove sequence + tree.remove(1) + assert tree.num_seqs() == 1 + + def test_append_operations(self): + """Test append vs extend operations.""" + tree = SuffixTree(16) + + # Start with extend + tree.extend(0, [1, 2, 3]) + + # Append individual tokens + tree.append(0, 4) + tree.append(0, 5) + + # Verify speculation works + result = tree.speculate([1, 2, 3, 4], 3, 1.0, 0.0, 0.1, False) + assert 5 in result.token_ids + + def test_multiple_sequences(self): + """Test handling multiple sequences with shared prefixes.""" + tree = SuffixTree(64) + + # Add sequences with common prefixes + sequences = [ + [1, 2, 3, 4, 5, 6], + [1, 2, 3, 4, 7, 8], + [1, 2, 3, 9, 10, 11], + [4, 5, 6, 7, 8, 9], + ] + + for i, seq in enumerate(sequences): + tree.extend(i, seq) + + # Test speculation on common prefix + result = tree.speculate([1, 2, 3], 3, 1.0, 0.0, 0.1, False) + assert len(result.token_ids) > 0 + # Should predict one of the continuations + assert result.token_ids[0] in [4, 9] + + def test_speculation_parameters(self): + """Test different speculation parameters.""" + tree = SuffixTree(32) + tree.extend(0, list(range(20))) + + # Test max_spec_tokens + result1 = tree.speculate([0, 1, 2], 3, 1.0, 0.0, 0.1, False) + assert len(result1.token_ids) <= 3 + + result2 = tree.speculate([0, 1, 2], 10, 1.0, 0.0, 0.1, False) + assert len(result2.token_ids) <= 10 + + # Test max_spec_factor and offset + # With factor=0.5 and match_len=3, max tokens = 3*0.5 = 1.5 -> 1 + result3 = tree.speculate([0, 1, 2], 10, 0.5, 0.0, 0.1, False) + assert len(result3.token_ids) <= 2 + + # With offset=3, even short matches can speculate more + result4 = tree.speculate([0, 1], 10, 0.0, 3.0, 0.1, False) + assert len(result4.token_ids) <= 3 + + def test_integrity_check(self): + """Test tree integrity checking.""" + tree = SuffixTree(16) + + # Add some sequences + tree.extend(0, [1, 2, 3, 4]) + tree.extend(1, [1, 2, 5, 6]) + tree.extend(2, [7, 8, 9]) + + # Check integrity + integrity = tree.check_integrity() + assert integrity == "", f"Integrity check failed: {integrity}" + + # Remove a sequence and check again + tree.remove(1) + integrity = tree.check_integrity() + assert integrity == "", f"Integrity check failed: {integrity}" + + def test_memory_estimation(self): + """Test memory usage estimation.""" + tree = SuffixTree(32) + + # Empty tree should have minimal memory + initial_memory = tree.estimate_memory() + assert initial_memory > 0 + + # Add sequences and check memory increases + for i in range(10): + tree.extend(i, list(range(i, i + 20))) + + final_memory = tree.estimate_memory() + assert final_memory > initial_memory + + def test_empty_sequences(self): + """Test handling of empty sequences.""" + tree = SuffixTree(16) + + # Add empty sequence - C++ implementation may not track empty sequences + tree.extend(0, []) + # The C++ implementation doesn't count empty sequences + # This is actually reasonable behavior + + # Add a non-empty sequence + tree.extend(1, [1, 2, 3]) + assert tree.num_seqs() >= 1 + + # Speculation on empty pattern + result = tree.speculate([], 5, 1.0, 0.0, 0.1, False) + assert result.token_ids == [] + assert result.score == 0.0 + + def test_large_sequences(self): + """Test handling of sequences larger than max_depth.""" + max_depth = 16 + tree = SuffixTree(max_depth) + + # Add sequence longer than max_depth + long_seq = list(range(100)) + tree.extend(0, long_seq) + + # Pattern at the end 
should still work (within max_depth window) + pattern = list(range(80, 85)) + result = tree.speculate(pattern, 5, 1.0, 0.0, 0.1, False) + assert result.token_ids == list(range(85, 90)) + + def test_tree_vs_path_speculation(self): + """Test tree-based vs path-based speculation.""" + tree = SuffixTree(32) + + # Add multiple sequences with branching + tree.extend(0, [1, 2, 3, 4, 5]) + tree.extend(1, [1, 2, 3, 6, 7]) + tree.extend(2, [1, 2, 3, 4, 8]) + + # Path speculation (use_tree_spec=False) + path_result = tree.speculate([1, 2, 3], 5, 1.0, 0.0, 0.1, False) + + # Tree speculation (use_tree_spec=True) + tree_result = tree.speculate([1, 2, 3], 5, 1.0, 0.0, 0.1, True) + + # Both should return valid results + assert len(path_result.token_ids) > 0 + assert len(tree_result.token_ids) > 0 + + # Tree speculation might return different structure + # (multiple branches vs single path) + assert path_result.parents[0] == -1 # First token has no parent + if len(tree_result.token_ids) > 1: + # Check tree structure is valid + for i, parent in enumerate(tree_result.parents): + assert parent < i # Parent index must be less than current + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index ccb91999d370..a808f18c99df 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -685,7 +685,7 @@ def try_verify_and_update_config(self): f"Model: {self.model_config.model}") def compile_debug_dump_path(self) -> Optional[Path]: - """Returns a rank-aware path for dumping + """Returns a rank-aware path for dumping torch.compile debug information. """ if self.compilation_config.debug_dump_path is None: diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index cb4f0ae2cee0..5817540cf486 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -32,7 +32,7 @@ SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa", "mlp_speculator", "draft_model", "deepseek_mtp", "ernie_mtp", "qwen3_next_mtp", "mimo_mtp", - "longcat_flash_mtp", "mtp"] + "longcat_flash_mtp", "mtp", "suffix"] MTP_MODEL_TYPES = ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp", "ernie_mtp", "qwen3_next_mtp", "longcat_flash_mtp") @@ -100,6 +100,30 @@ class SpeculativeConfig: """Minimum size of ngram token window when using Ngram proposer, if provided. Defaults to 1.""" + # Suffix decode configuration + suffix_cache_max_depth: int = 64 + """Maximum depth of the suffix trees.""" + suffix_cache_max_requests: int = 100000 + """Maximum number of cached requests. When this limit is reached, the + least recently used inactive requests are evicted from the cache using + FIFO order.""" + suffix_max_spec_factor: float = 1.0 + """Factor that dynamically limits speculation based on matched pattern + length. The actual max tokens speculated is calculated as: + min(num_speculative_tokens, + match_length * suffix_max_spec_factor + suffix_max_spec_offset). + Higher values allow more aggressive speculation when longer patterns + are matched. Example: With factor=1.5 and a 10-token match, up to + 15 tokens can be speculated.""" + suffix_max_spec_offset: float = 0.0 + """Offset added to the dynamic speculation limit calculation. + This provides a minimum number of tokens that can be speculated even + with short pattern matches. Works in conjunction with + suffix_max_spec_factor. 
Example: With offset=2.0, at least 2 tokens + can be speculated even with a 1-token match.""" + suffix_min_token_prob: float = 0.6 + """Minimum estimated probability threshold for candidate tokens.""" + speculative_token_tree: Optional[str] = None """Specifies the tree structure for speculative token generation. """ @@ -227,6 +251,8 @@ def __post_init__(self): self.quantization = self.target_model_config.quantization elif self.method in ("ngram", "[ngram]"): self.model = "ngram" + elif self.method == "suffix": + self.model = "suffix" else: raise ValueError( "num_speculative_tokens was provided but without " @@ -265,7 +291,30 @@ def __post_init__(self): raise ValueError( f"prompt_lookup_min={self.prompt_lookup_min} must " f"be <= prompt_lookup_max={self.prompt_lookup_max}") - + elif self.method == "suffix": + # Validate suffix decode parameters + if self.suffix_cache_max_depth < 1: + raise ValueError( + f"suffix_cache_max_depth={self.suffix_cache_max_depth} " + "must be greater than or equal to 1") + if (self.suffix_cache_max_requests is not None + and self.suffix_cache_max_requests < 1): + raise ValueError(f"suffix_cache_max_requests=" + f"{self.suffix_cache_max_requests} " + "must be greater than or equal to 1") + if self.suffix_max_spec_factor < 0: + raise ValueError(f"suffix_max_spec_factor=" + f"{self.suffix_max_spec_factor} " + "must be greater than or equal to 0") + if self.suffix_max_spec_offset < 0: + raise ValueError(f"suffix_max_spec_offset=" + f"{self.suffix_max_spec_offset} " + "must be greater than or equal to 0") + if (self.suffix_min_token_prob < 0 + or self.suffix_min_token_prob > 1): + raise ValueError(f"suffix_min_token_prob=" + f"{self.suffix_min_token_prob} " + "must be in range [0, 1]") # TODO: current we still need extract vocab_size from target model # config, in future, we may try refactor it out, and set # draft related config as None here. @@ -557,6 +606,7 @@ def use_eagle(self) -> bool: def __repr__(self) -> str: method = self.method - model = None if method == "ngram" else self.draft_model_config.model + model = None if method in ("ngram", + "suffix") else self.draft_model_config.model num_spec_tokens = self.num_speculative_tokens return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})" diff --git a/vllm/v1/spec_decode/suffix_decode/__init__.py b/vllm/v1/spec_decode/suffix_decode/__init__.py new file mode 100644 index 000000000000..efe3f273e3a4 --- /dev/null +++ b/vllm/v1/spec_decode/suffix_decode/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2025 Snowflake Inc. +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .suffix_cache import Candidate, SuffixCache, SuffixSpecResult, SuffixTree + +__all__ = ["SuffixCache", "SuffixSpecResult", "Candidate", "SuffixTree"] diff --git a/vllm/v1/spec_decode/suffix_decode/suffix_cache.py b/vllm/v1/spec_decode/suffix_decode/suffix_cache.py new file mode 100644 index 000000000000..4dcbb9f7604b --- /dev/null +++ b/vllm/v1/spec_decode/suffix_decode/suffix_cache.py @@ -0,0 +1,441 @@ +# Copyright 2025 Snowflake Inc. +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from collections.abc import Hashable, KeysView +from dataclasses import dataclass, field +from typing import NamedTuple, Optional, Union + +import torch + +import vllm._suffix_cache_C # noqa: F401 + + +class Candidate(NamedTuple): + """Result of suffix tree speculation from C++ implementation.""" + token_ids: list[int] + parents: list[int] + probs: list[float] + score: float + match_len: int + + +class SuffixTree: + """Python wrapper for the C++ SuffixTree implementation.""" + + def __init__(self, max_depth: int): + """Initialize a new suffix tree. + + Args: + max_depth: Maximum depth of the suffix tree. + """ + self._handle = torch.ops._suffix_cache_C.suffix_tree_create(max_depth) + self._destroyed = False + + def __del__(self): + """Clean up the C++ suffix tree object.""" + if hasattr(self, '_handle') and not self._destroyed: + torch.ops._suffix_cache_C.suffix_tree_destroy(self._handle) + self._destroyed = True + + def num_seqs(self) -> int: + """Get the number of sequences in the suffix tree.""" + return int(torch.ops._suffix_cache_C.suffix_tree_num_seqs( + self._handle)) + + def append(self, seq_id: int, token: int) -> None: + """Append a new element to the sequence with id seq_id. + + Args: + seq_id: ID of the sequence to append to. + token: Token to append. + """ + torch.ops._suffix_cache_C.suffix_tree_append(self._handle, seq_id, + token) + + def extend(self, seq_id: int, tokens: list[int]) -> None: + """Append multiple new elements to the sequence with id seq_id. + + Args: + seq_id: ID of the sequence to extend. + tokens: list of tokens to append. + """ + tokens_tensor = torch.tensor(tokens, dtype=torch.int64) + torch.ops._suffix_cache_C.suffix_tree_extend(self._handle, seq_id, + tokens_tensor) + + def remove(self, seq_id: int) -> None: + """Remove the sequence with id seq_id. + + Args: + seq_id: ID of the sequence to remove. + """ + torch.ops._suffix_cache_C.suffix_tree_remove(self._handle, seq_id) + + def speculate(self, + pattern: list[int], + max_spec_tokens: int, + max_spec_factor: float = 1.0, + max_spec_offset: float = 0.0, + min_token_prob: float = 0.1, + use_tree_spec: bool = False) -> Candidate: + """Given a pattern, speculate the next tokens using the suffix tree. + + Args: + pattern: The pattern to match. + max_spec_tokens: Maximum number of tokens to speculate. + max_spec_factor: Maximum speculation factor. 
+ max_spec_offset: Maximum speculation offset. + min_token_prob: Minimum token probability threshold. + use_tree_spec: Whether to use tree-based speculation. + + Returns: + Candidate object containing speculation results. + """ + pattern_tensor = torch.tensor(pattern, dtype=torch.int64) + + token_ids, parents, probs, score, match_len = \ + torch.ops._suffix_cache_C.suffix_tree_speculate( + self._handle, pattern_tensor, max_spec_tokens, max_spec_factor, + max_spec_offset, min_token_prob, use_tree_spec) + + return Candidate(token_ids=token_ids.tolist(), + parents=parents.tolist(), + probs=probs.tolist(), + score=float(score), + match_len=int(match_len)) + + def check_integrity(self) -> str: + """Check the integrity of the suffix tree. + + Returns: + Empty string if ok, otherwise an error message. + """ + return torch.ops._suffix_cache_C.suffix_tree_check_integrity( + self._handle) + + def estimate_memory(self) -> int: + """Estimate memory usage of the suffix tree. + + Note: This walks the entire tree so can be slow. + + Returns: + Estimated memory usage in bytes. + """ + return int( + torch.ops._suffix_cache_C.suffix_tree_estimate_memory( + self._handle)) + + +@dataclass +class SuffixSpecResult: + """ + A dataclass representing the result of a speculation using SuffixDecoding. + + Attributes: + token_ids (list[int]): list of token IDs in the speculation result. + parents (list[int]): list of parent indices for each token used to + encode the tree structure. The parent token of token_ids[i] is + token_ids[parents[i]]. + probs (list[float]): list of estimated probabilities for each token. + score (float): The overall score of the suffix match computed as the + sum of the estimated probabilities of each speculated token. + match_len (int): The length of the pattern match that yielded this + speculation result. + """ + token_ids: list[int] = field(default_factory=list) + parents: list[int] = field(default_factory=list) + probs: list[float] = field(default_factory=list) + score: float = 0.0 + match_len: int = 0 + + @staticmethod + def from_candidate(candidate: Candidate) -> SuffixSpecResult: + return SuffixSpecResult( + token_ids=candidate.token_ids, + parents=candidate.parents, + probs=candidate.probs, + score=candidate.score, + match_len=candidate.match_len, + ) + + +class SuffixCache: + + def __init__(self, + max_tree_depth: int = 64, + max_cached_requests: int = 1000): + """ + Initialize the SuffixCache. + + Args: + max_tree_depth (int): The maximum depth of the suffix trees. + max_cached_requests (int, optional): The maximum number of cached + requests. Cache eviction is used when the limit is reached. If + `None`, there is no limit on the number of cached requests. + """ + self._max_tree_depth = max_tree_depth + self._max_cached_requests = max_cached_requests + + # Global suffix tree caches previous responses in a single tree. + self._global_tree = SuffixTree(max_tree_depth) + + # Local suffix trees cache prompts for each active request separately. + self._local_trees: dict[Hashable, SuffixTree] = {} + + # Maps between Python request ID and int32_t sequence ID. Tracks all + # request IDs that are in the global tree or one of the local trees. + self._req_to_seq_id: dict[Hashable, int] = {} + self._seq_to_req_id: dict[int, Hashable] = {} + + # Unused sequence ID to assign to a new request ID. 
+ self._next_seq_id = 0 + + @property + def max_tree_depth(self) -> int: + return self._max_tree_depth + + @property + def max_cached_requests(self) -> int: + return self._max_cached_requests + + @property + def active_requests(self) -> KeysView: + """ + Returns a view of the currently active request IDs. Active requests are + those that have been started via `start_request` and not yet stopped + via `stop_request`. The prompts of active requests are stored so they + can be used during speculation for the same request. + """ + return self._local_trees.keys() + + @property + def cached_requests(self) -> KeysView: + """ + Returns a view of all request IDs that have their responses cached in + the global suffix tree. The response for the cached request can be used + during speculation for other requests, until the response is evicted. + """ + return self._req_to_seq_id.keys() + + def start_request(self, req_id: Hashable, prompt_token_ids: list[int]): + """ + This method should be called when starting to process a new request. It + will store the prompt for the request, allowing future speculations for + the same request to use the prompt context. The prompt will be stored + until `stop_request` is called. + + Args: + req_id (Hashable): The request identifier. Must be a hashable value + that uniquely identifies the request. + prompt_token_ids (list[int]): A sequence of token IDs + representing the prompt of the request. + + Raises: + ValueError: If a request with the same `req_id` is already active + or cached. + """ + if req_id in self._req_to_seq_id: + raise ValueError(f"Request '{req_id}' is already active or cached") + seq_id = self._generate_seq_id(req_id) + self._local_trees[req_id] = SuffixTree(self._max_tree_depth) + self._local_trees[req_id].extend(seq_id, prompt_token_ids) + + def stop_request(self, req_id: Hashable): + """ + This method should be called when a request is completed. It will evict + the prompt for the request, freeing up memory. + + Args: + req_id (Hashable): The request identifier. Must be a hashable value + that uniquely identifies the request. + + Raises: + ValueError: If the request with the given `req_id` is not active. + """ + if req_id not in self._local_trees: + raise ValueError(f"Request '{req_id}' is not active") + del self._local_trees[req_id] + + def add_active_response( + self, + req_id: Hashable, + token_ids: Union[int, list[int]], + ): + """ + Update the cached response for a given request by appending token(s) to + its end. Once the response is updated, the new tokens can be used for + future speculations for all requests. + + Args: + req_id (Hashable): The unique identifier for the request. + token_ids (Union[int, list[int]]): Either a single token ID + (int) or a sequence of token IDs to be appended to the response + for the given request. + + Raises: + ValueError: If the request with the given `req_id` is not active. + """ + if req_id not in self._local_trees: + raise ValueError(f"Request '{req_id}' is not active") + seq_id = self._req_to_seq_id[req_id] + if isinstance(token_ids, int): + self._global_tree.append(seq_id, token_ids) + self._local_trees[req_id].append(seq_id, token_ids) + else: + self._global_tree.extend(seq_id, token_ids) + self._local_trees[req_id].extend(seq_id, token_ids) + + def insert_new_response( + self, + req_id: Hashable, + token_ids: list[int], + ): + """ + Insert a complete response to the global cache for a request that is + not active and is not already cached. 
+ + Args: + req_id (Hashable): The unique identifier for the request. + token_ids (list[int]): A sequence of token IDs to be inserted + as the response for the given request. + + Raises: + ValueError: If a request with the same `req_id` is already active + or cached. + """ + if req_id in self._req_to_seq_id: + raise ValueError(f"Request '{req_id}' is already active or cached") + seq_id = self._generate_seq_id(req_id) + self._global_tree.extend(seq_id, token_ids) + + def evict_request(self, req_id: Hashable): + """ + Evicts the given request's prompt and response from the cache. If the + request is active, it becomes inactive. The `req_id` can then be reused + after eviction. + + Args: + req_id (Hashable): The unique identifier for the request that + should be evicted. + + Raises: + ValueError: If no response exists for the given request identifier. + """ + if req_id not in self._req_to_seq_id: + raise ValueError(f"Request '{req_id}' is not active or cached") + if req_id in self._local_trees: + del self._local_trees[req_id] + seq_id = self._req_to_seq_id.pop(req_id) + self._seq_to_req_id.pop(seq_id) + self._global_tree.remove(seq_id) + + def speculate( + self, + req_id: Hashable, + pattern: list[int], + max_spec_tokens: Optional[int] = None, + max_spec_factor: float = 1.0, + max_spec_offset: float = 0.0, + min_token_prob: float = 0.1, + use_tree_spec: bool = False, + ) -> SuffixSpecResult: + """ + Speculates and returns the most likely continuation of a given token + pattern using the request's prompt and the global cache of previous + responses. This method can only be called for active requests (i.e. + after calling `start_request` and before calling `stop_request`). + + Args: + req_id (Hashable): The unique identifier for the request. + pattern (list[int]): The sequence of token IDs to match and + continue from. + max_spec_tokens (int): Maximum number of tokens to speculate. If 0, + uses the cache's max_depth. + max_spec_factor (float): Factor that limits speculation based on + matched pattern length. + min_token_prob (float): Minimum estimated probability threshold for + candidate tokens. + use_tree_spec (bool): If True, uses tree-based speculation. + + Returns: + The speculation result containing the most likely continuation + tokens, their probabilities, and overall score. + + Raises: + ValueError: If the request with the given `req_id` is not active. + """ + if req_id not in self._local_trees: + raise ValueError(f"Request '{req_id}' is not active") + + if max_spec_tokens is None: + max_spec_tokens = self._max_tree_depth + + if len(pattern) > self._max_tree_depth: + pattern = pattern[-self._max_tree_depth:] + + candidate = self._local_trees[req_id].speculate( + pattern, max_spec_tokens, max_spec_factor, max_spec_offset, + min_token_prob, use_tree_spec) + result = SuffixSpecResult.from_candidate(candidate) + + candidate = self._global_tree.speculate(pattern, max_spec_tokens, + max_spec_factor, + max_spec_offset, + min_token_prob, use_tree_spec) + if candidate.score > result.score: + result = SuffixSpecResult.from_candidate(candidate) + + return result + + def _generate_seq_id(self, req_id: Hashable) -> int: + # Find the next available seq_id not used by an active request. + while True: + seq_id = self._next_seq_id + # Increment to the next non-negative int32_t value. 
+ self._next_seq_id = (self._next_seq_id + 1) & 0x7FFFFFFF + if (seq_id not in self._seq_to_req_id + or self._seq_to_req_id[seq_id] not in self._local_trees): + break + # Check if the seq_id is used by an inactive but cached request. + if seq_id in self._seq_to_req_id: + # This seq_id is already used, should be a very rare case that + # only happens when the seq_id has wrapped around and collided. + # We evict the old cached request to free up the seq_id. + del self._req_to_seq_id[self._seq_to_req_id[seq_id]] + del self._seq_to_req_id[seq_id] + self._global_tree.remove(seq_id) + # Allocate the seq_id to the new req_id. + self._req_to_seq_id[req_id] = seq_id + self._seq_to_req_id[seq_id] = req_id + self._maybe_evict_requests(seq_id) + return seq_id + + def _maybe_evict_requests(self, new_seq_id: int): + if self._max_cached_requests is None: + return + while len(self._req_to_seq_id) > self._max_cached_requests: + # Evict the first request that is not active. Should be FIFO order + # in python 3.7+ as dict preserves insertion order. We also want to + # avoid evicting the request that was just added (new_seq_id). + for req_id, seq_id in self._req_to_seq_id.items(): + if seq_id != new_seq_id and req_id not in self._local_trees: + self.evict_request(req_id) + break + else: + # All previously cached requests are active, cannot evict any. + break diff --git a/vllm/v1/spec_decode/suffix_proposer.py b/vllm/v1/spec_decode/suffix_proposer.py new file mode 100644 index 000000000000..e738621cf8bc --- /dev/null +++ b/vllm/v1/spec_decode/suffix_proposer.py @@ -0,0 +1,78 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Hashable +from typing import Optional + +import numpy as np + +from vllm.config import VllmConfig +from vllm.v1.spec_decode.suffix_decode.suffix_cache import SuffixCache + + +class SuffixProposer: + """Proposer for suffix-decoding based speculative decoding.""" + + def __init__(self, vllm_config: VllmConfig): + self.spec_config = vllm_config.speculative_config + self.vllm_config = vllm_config + + # Initialize suffix cache with configuration parameters + self._suffix_cache = SuffixCache( + max_tree_depth=self.spec_config.suffix_cache_max_depth, + max_cached_requests=self.spec_config.suffix_cache_max_requests) + + self.max_spec_tokens = self.spec_config.num_speculative_tokens + self.max_spec_factor = self.spec_config.suffix_max_spec_factor + self.max_spec_offset = self.spec_config.suffix_max_spec_offset + self.min_token_prob = self.spec_config.suffix_min_token_prob + + # Track active requests + self._active_requests: set[Hashable] = set() + + def start_request(self, req_id: Hashable, prompt_token_ids: list[int]): + """Start tracking a new request.""" + if req_id not in self._active_requests: + self._suffix_cache.start_request(req_id, prompt_token_ids) + self._active_requests.add(req_id) + + def stop_request(self, req_id: Hashable): + """Stop tracking a request.""" + if req_id in self._active_requests: + self._suffix_cache.stop_request(req_id) + self._active_requests.remove(req_id) + + def update_response(self, req_id: Hashable, token_ids: list[int]): + """Update the cached response for a request.""" + if req_id in self._active_requests: + self._suffix_cache.add_active_response(req_id, token_ids) + + def propose(self, + context_token_ids: np.ndarray, + req_id: Optional[Hashable] = None) -> Optional[np.ndarray]: + """Propose speculative tokens based on suffix matching.""" + if req_id is None or req_id not 
in self._active_requests: + # If no request ID or not an active request, return empty proposal + return None + + # Convert numpy array to list for pattern matching + pattern = context_token_ids.tolist() + + # Get speculation result from suffix cache + result = self._suffix_cache.speculate( + req_id=req_id, + pattern=pattern, + max_spec_tokens=self.max_spec_tokens, + max_spec_factor=self.max_spec_factor, + max_spec_offset=self.max_spec_offset, + min_token_prob=self.min_token_prob, + use_tree_spec=False # TODO: Add configuration for tree speculation + ) + + if result.token_ids: + return np.array(result.token_ids, dtype=np.int32) + else: + return None + + def load_model(self, *args, **kwargs): + # No model to load for suffix-decode based speculative decoding + pass diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f8b0b9cba1bc..93cec0f86e41 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -95,6 +95,7 @@ from vllm.v1.spec_decode.medusa import MedusaProposer from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer +from vllm.v1.spec_decode.suffix_proposer import SuffixProposer from vllm.v1.structured_output.utils import apply_grammar_bitmask from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch @@ -154,7 +155,7 @@ def __init__( def get_output(self) -> ModelRunnerOutput: """Copy the device tensors to the host and return a ModelRunnerOutput. - + This function blocks until the copy is finished. """ self._async_copy_ready_event.synchronize() @@ -281,6 +282,8 @@ def __init__( if self.speculative_config and get_pp_group().is_last_rank: if self.speculative_config.method == "ngram": self.drafter = NgramProposer(self.vllm_config) + elif self.speculative_config.method == "suffix": + self.drafter = SuffixProposer(self.vllm_config) # type: ignore elif self.speculative_config.use_eagle(): self.drafter = EagleProposer(self.vllm_config, self.device, self) # type: ignore @@ -552,6 +555,11 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove finished requests from the cached states. for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) + # Stop tracking request in suffix cache if using suffix proposer + if (self.speculative_config + and self.speculative_config.method == "suffix"): + assert isinstance(self.drafter, SuffixProposer) + self.drafter.stop_request(req_id) # Remove the finished requests from the persistent batch. # NOTE(woosuk): There could be an edge case where finished_req_ids and # scheduled_req_ids overlap. This happens when a request is aborted and @@ -618,6 +626,13 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: ) self.requests[req_id] = req_state + # Start tracking request in suffix cache if using suffix proposer + if (self.speculative_config + and self.speculative_config.method == "suffix"): + assert isinstance(self.drafter, SuffixProposer) + self.drafter.start_request(req_id, + new_req_data.prompt_token_ids) + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: self._init_mrope_positions(req_state) @@ -846,7 +861,7 @@ def _get_cumsum_and_arange( def _prepare_input_ids(self, total_num_scheduled_tokens: int, cu_num_tokens: np.ndarray) -> None: """Prepare the input IDs for the current batch. 
- + Carefully handles the `prev_sampled_token_ids` which can be cached from the previous engine iteration, in which case those tokens on the GPU need to be copied into the corresponding slots into input_ids.""" @@ -2243,6 +2258,12 @@ def _bookkeeping_sync( req_state = self.requests[req_id] req_state.output_token_ids.extend(sampled_ids) + # Update suffix cache if using suffix proposer + if (self.speculative_config + and self.speculative_config.method == "suffix"): + assert isinstance(self.drafter, SuffixProposer) + self.drafter.update_response(req_id, sampled_ids) + return ( num_nans_in_logits, logprobs_lists, @@ -2279,7 +2300,7 @@ def _model_forward( """Helper method to call the model forward pass. This method can be overridden by subclasses for model execution. - Motivation: We can inspect only this method versus + Motivation: We can inspect only this method versus the whole execute_model, which has additional logic. Args: @@ -2549,6 +2570,10 @@ def propose_draft_token_ids( self.input_batch.num_tokens_no_spec, self.input_batch.token_ids_cpu, self.input_batch.spec_decode_unsupported_reqs) + elif self.speculative_config.method == "suffix": + assert isinstance(self.drafter, SuffixProposer) + draft_token_ids = self.propose_suffix_draft_token_ids( + sampled_token_ids) elif self.speculative_config.method == "medusa": assert isinstance(sampled_token_ids, list) assert isinstance(self.drafter, MedusaProposer) @@ -2661,6 +2686,41 @@ def propose_draft_token_ids( return draft_token_ids + def propose_suffix_draft_token_ids( + self, + sampled_token_ids: list[list[int]], + ) -> list[list[int]]: + req_ids = self.input_batch.req_ids + draft_token_ids: list[list[int]] = [] + assert isinstance(self.drafter, SuffixProposer) + for i, sampled_ids in enumerate(sampled_token_ids): + num_sampled_ids = len(sampled_ids) + if not num_sampled_ids: + # Skip speculative decoding. + draft_token_ids.append([]) + continue + + # Skip requests that require sampling parameters that are not + # supported with speculative decoding. + req_id = req_ids[i] + if req_id in self.input_batch.spec_decode_unsupported_reqs: + draft_token_ids.append([]) + continue + + num_tokens = self.input_batch.num_tokens_no_spec[i] + if num_tokens >= self.max_model_len: + # Skip requests that have already reached the max model length. + draft_token_ids.append([]) + continue + + drafter_output = self.drafter.propose( + self.input_batch.token_ids_cpu[i, :num_tokens], req_id=req_id) + if drafter_output is None or len(drafter_output) == 0: + draft_token_ids.append([]) + else: + draft_token_ids.append(drafter_output.tolist()) + return draft_token_ids + def update_config(self, overrides: dict[str, Any]) -> None: allowed_config_names = {"load_config", "model_config"} for config_name, config_overrides in overrides.items(): @@ -3673,7 +3733,7 @@ def create_attn_groups( def initialize_cudagraph_capture(self) -> None: """ - Resolve the cudagraph_mode when there are multiple attention + Resolve the cudagraph_mode when there are multiple attention backends with potential conflicting CUDA graph support. Then initialize the cudagraph_dispatcher based on the resolved cudagraph_mode.
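The remaining sketches are illustrative companions to the diff above, not additional changes to it. First, the dynamic speculation budget described in the `suffix_max_spec_factor` / `suffix_max_spec_offset` docstrings of `SpeculativeConfig`: the number of draft tokens is capped at min(num_speculative_tokens, match_length * suffix_max_spec_factor + suffix_max_spec_offset). A minimal sketch of that cap, assuming the fractional value is floored (the C++ test comment "3*0.5 = 1.5 -> 1" suggests this, but the diff does not spell it out):

def effective_spec_budget(num_speculative_tokens: int,
                          match_length: int,
                          factor: float = 1.0,
                          offset: float = 0.0) -> int:
    # Dynamic cap from the SpeculativeConfig docstrings: longer suffix
    # matches may speculate more tokens, never exceeding the static limit.
    return min(num_speculative_tokens, int(match_length * factor + offset))

# The documented examples:
assert effective_spec_budget(32, 10, factor=1.5) == 15            # 10 * 1.5
assert effective_spec_budget(32, 1, factor=0.0, offset=2.0) == 2  # offset floor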
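Next, the draft-tree encoding shared by `Candidate` and `SuffixSpecResult`: the parent of `token_ids[i]` is `token_ids[parents[i]]`, with `parents[i] == -1` marking a root draft token (test_tree_vs_path_speculation also checks that every parent index is smaller than its child's index). A purely illustrative helper, not part of this diff, that reconstructs child lists from the flat encoding:

from collections import defaultdict


def build_children(parents: list[int]) -> dict[int, list[int]]:
    # Map each draft position to its child positions. A parent of -1 means
    # the token hangs off the last verified token rather than another draft.
    children: dict[int, list[int]] = defaultdict(list)
    for i, parent in enumerate(parents):
        if parent >= 0:
            children[parent].append(i)
    return dict(children)


# A two-token path plus one alternative branch off the first draft token:
assert build_children([-1, 0, 0]) == {0: [1, 2]}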
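Finally, how the feature would be switched on end to end. The model runner selects `SuffixProposer` whenever `speculative_config.method == "suffix"`, so a user-facing sketch might look like the following. This assumes the `speculative_config` dict accepted by vLLM's `LLM` entry point is forwarded to the `SpeculativeConfig` fields added in this diff; the model name and parameter values are arbitrary examples:

from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # any served model; no draft model is needed
    speculative_config={
        "method": "suffix",
        "num_speculative_tokens": 8,
        "suffix_cache_max_depth": 64,         # depth of the suffix trees
        "suffix_cache_max_requests": 100000,  # global cache eviction limit
        "suffix_max_spec_factor": 1.0,        # cap ~ match_len * factor + offset
        "suffix_max_spec_offset": 0.0,
        "suffix_min_token_prob": 0.6,         # per-token probability threshold
    },
)

outputs = llm.generate(["def fibonacci(n):"],
                       SamplingParams(temperature=0.0, max_tokens=128))

Because SuffixDecoding drafts tokens from previously seen prompt and response text rather than from a draft model (note that `SuffixProposer.load_model` is a no-op), it tends to help most on repetitive workloads such as code editing or agentic loops.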