
Commit b247f8d

[Serving] Add Medusa speculative decoding (#2337)
Parent: 0c03537

25 files changed: +558 -226 lines

cpp/metadata/model.cc

Lines changed: 12 additions & 3 deletions
@@ -63,8 +63,17 @@ ModelMetadata ModelMetadata::FromJSON(const picojson::object& metadata,
   if (metadata.count("attention_sink_size"))  // remove after sink is decoupled from model lib
     result.attention_sink_size = json::Lookup<int64_t>(metadata, "attention_sink_size");
   result.tensor_parallel_shards = json::Lookup<int64_t>(metadata, "tensor_parallel_shards");
-  result.kv_cache_metadata =
-      KVCacheMetadata::FromJSON(json::Lookup<picojson::object>(metadata, "kv_cache"));
+  result.kv_state_kind = KVStateKindFromString(
+      json::LookupOrDefault<std::string>(metadata, "kv_state_kind", "kv_cache"));
+  if (result.kv_state_kind != KVStateKind::kNone) {
+    result.kv_cache_metadata =
+        KVCacheMetadata::FromJSON(json::Lookup<picojson::object>(metadata, "kv_cache"));
+  } else {
+    result.kv_cache_metadata = {/*num_hidden_layers=*/0,
+                                /*head_dim=*/0,
+                                /*num_attention_heads=*/0,
+                                /*num_key_value_heads=*/0};
+  }
   {
     std::vector<ModelMetadata::Param>& params = result.params;
     picojson::array json_params = json::Lookup<picojson::array>(metadata, "params");
@@ -94,7 +103,7 @@ ModelMetadata ModelMetadata::FromModule(tvm::runtime::Module module,
   try {
     return ModelMetadata::FromJSON(json, model_config);
   } catch (const std::exception& e) {
-    LOG(WARNING) << "Failed to parse metadata:\n" << json_str;
+    LOG(WARNING) << "Failed to parse metadata:\n" << json_str << "\nerror: " << e.what();
     throw e;
   }
 }
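
The key behavioral point in this hunk is the backward-compatible default: model libraries built before this commit carry no "kv_state_kind" entry in their metadata, so LookupOrDefault treats them as "kv_cache", while a library that declares "none" (presumably the case for Medusa heads, which piggyback on the base model's cache rather than owning one) gets zeroed KV-cache metadata instead of a mandatory "kv_cache" object. A standalone sketch of that lookup behavior, with a std::map standing in for the picojson object:

#include <iostream>
#include <map>
#include <string>

// Stand-in for json::LookupOrDefault: return the mapped value if present,
// otherwise the fallback.
std::string LookupOrDefault(const std::map<std::string, std::string>& obj,
                            const std::string& key, const std::string& fallback) {
  auto it = obj.find(key);
  return it == obj.end() ? fallback : it->second;
}

int main() {
  std::map<std::string, std::string> old_lib;  // pre-existing lib: no entry at all
  std::map<std::string, std::string> no_cache{{"kv_state_kind", "none"}};
  std::cout << LookupOrDefault(old_lib, "kv_state_kind", "kv_cache") << "\n";   // kv_cache
  std::cout << LookupOrDefault(no_cache, "kv_state_kind", "kv_cache") << "\n";  // none
}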

cpp/metadata/model.h

Lines changed: 31 additions & 0 deletions
@@ -16,6 +16,36 @@
 namespace mlc {
 namespace llm {
 
+/*! \brief The kind of cache. */
+enum class KVStateKind : int {
+  kKVCache = 0,
+  kRNNState = 1,
+  kNone = 2,
+};
+
+inline std::string KVStateKindToString(KVStateKind kv_state_kind) {
+  if (kv_state_kind == KVStateKind::kKVCache) {
+    return "kv_cache";
+  } else if (kv_state_kind == KVStateKind::kRNNState) {
+    return "rnn_state";
+  } else if (kv_state_kind == KVStateKind::kNone) {
+    return "none";
+  } else {
+    LOG(FATAL) << "Invalid kv state kind: " << static_cast<int>(kv_state_kind);
+  }
+}
+
+inline KVStateKind KVStateKindFromString(const std::string& kv_state_kind) {
+  if (kv_state_kind == "kv_cache") {
+    return KVStateKind::kKVCache;
+  } else if (kv_state_kind == "rnn_state") {
+    return KVStateKind::kRNNState;
+  } else if (kv_state_kind == "none") {
+    return KVStateKind::kNone;
+  } else {
+    LOG(FATAL) << "Invalid kv state kind string: " << kv_state_kind;
+  }
+}
 struct ModelMetadata {
   struct Param {
     struct Preproc {
@@ -49,6 +79,7 @@ struct ModelMetadata {
   int64_t attention_sink_size;
   std::vector<Param> params;
   std::unordered_map<std::string, int64_t> memory_usage;
+  KVStateKind kv_state_kind;
   KVCacheMetadata kv_cache_metadata;
 
   static ModelMetadata FromJSON(const picojson::object& json_str,
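
KVStateKind and its string helpers move here from cpp/serve/config.h so that per-model metadata can carry the cache kind; the move adds the kNone member and fixes the old one-line KVStateKindToString, which returned the misspelled "rnn_State" for RNN state. A minimal usage sketch, assuming this header (and the TVM logging it relies on) is included:

using namespace mlc::llm;

KVStateKind kind = KVStateKindFromString("none");  // KVStateKind::kNone
std::string name = KVStateKindToString(kind);      // back to "none"
// Any unrecognized string or enum value aborts via LOG(FATAL) instead of
// silently mapping to a default, as the old ternary version did.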

cpp/serve/config.cc

Lines changed: 6 additions & 7 deletions
@@ -248,7 +248,6 @@ EngineConfig EngineConfig::FromJSONAndInferredConfig(
   CHECK(inferred_config.max_single_sequence_length.has_value());
   CHECK(inferred_config.prefill_chunk_size.has_value());
   CHECK(inferred_config.max_history_size.has_value());
-  CHECK(inferred_config.kv_state_kind.has_value());
   ObjectPtr<EngineConfigNode> n = make_object<EngineConfigNode>();
 
   // - Get models and model libs.
@@ -290,7 +289,6 @@ EngineConfig EngineConfig::FromJSONAndInferredConfig(
   n->max_single_sequence_length = inferred_config.max_single_sequence_length.value();
   n->prefill_chunk_size = inferred_config.prefill_chunk_size.value();
   n->max_history_size = inferred_config.max_history_size.value();
-  n->kv_state_kind = inferred_config.kv_state_kind.value();
 
   return EngineConfig(n);
 }
@@ -356,7 +354,6 @@ String EngineConfigNode::AsJSONString() const {
       picojson::value(static_cast<int64_t>(this->max_single_sequence_length));
   config["prefill_chunk_size"] = picojson::value(static_cast<int64_t>(this->prefill_chunk_size));
   config["max_history_size"] = picojson::value(static_cast<int64_t>(this->max_history_size));
-  config["kv_state_kind"] = picojson::value(KVStateKindToString(this->kv_state_kind));
   config["speculative_mode"] = picojson::value(SpeculativeModeToString(this->speculative_mode));
   config["spec_draft_length"] = picojson::value(static_cast<int64_t>(this->spec_draft_length));
   config["verbose"] = picojson::value(static_cast<bool>(this->verbose));
@@ -428,14 +425,18 @@ Result<ModelConfigLimits> GetModelConfigLimits(const std::vector<picojson::objec
            ") is larger than the prefill chunk size used at compile time (" +
            std::to_string(compile_time_prefill_chunk_size) + ").");
     }
-    model_max_prefill_chunk_size =
-        std::min(model_max_prefill_chunk_size, runtime_prefill_chunk_size);
+    if (runtime_prefill_chunk_size != -1) {
+      model_max_prefill_chunk_size =
+          std::min(model_max_prefill_chunk_size, runtime_prefill_chunk_size);
+    }
     // - The maximum batch size is the minimum max batch size among all models.
     model_max_batch_size = std::min(
         model_max_batch_size, json::Lookup<int64_t>(compile_time_model_config, "max_batch_size"));
   }
   ICHECK_NE(model_max_prefill_chunk_size, std::numeric_limits<int64_t>::max());
   ICHECK_NE(model_max_batch_size, std::numeric_limits<int64_t>::max());
+  ICHECK_GT(model_max_prefill_chunk_size, 0);
+  ICHECK_GT(model_max_batch_size, 0);
   return Result<ModelConfigLimits>::Ok(
       {model_max_single_sequence_length, model_max_prefill_chunk_size, model_max_batch_size});
 }
@@ -689,7 +690,6 @@ Result<InferrableEngineConfig> InferrableEngineConfig::InferForKVCache(
         << " MB). The actual usage might be slightly larger than the estimated number.";
   }
 
-  inferred_config.kv_state_kind = KVStateKind::kKVCache;
   inferred_config.max_history_size = 0;
   return Result<InferrableEngineConfig>::Ok(inferred_config);
 }
@@ -853,7 +853,6 @@ Result<InferrableEngineConfig> InferrableEngineConfig::InferForRNNState(
         << " MB). The actual usage might be slightly larger than the estimated number.";
   }
 
-  inferred_config.kv_state_kind = KVStateKind::kRNNState;
   return Result<InferrableEngineConfig>::Ok(inferred_config);
 }
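
The clamping fix matters because -1 evidently serves as the sentinel for an unset runtime prefill chunk size: the old unconditional std::min dragged the model limit down to -1, a state the two new ICHECK_GT guards would now reject. A standalone mirror of the before/after arithmetic, with illustrative values:

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  int64_t model_max_prefill_chunk_size = 2048;  // from the compile-time model config
  int64_t runtime_prefill_chunk_size = -1;      // user left the field unset

  // Old behavior: the sentinel wins the min() and poisons the limit.
  assert(std::min(model_max_prefill_chunk_size, runtime_prefill_chunk_size) == -1);

  // New behavior: the sentinel is skipped, so the positive limit survives
  // the added ICHECK_GT(model_max_prefill_chunk_size, 0) guard.
  if (runtime_prefill_chunk_size != -1) {
    model_max_prefill_chunk_size =
        std::min(model_max_prefill_chunk_size, runtime_prefill_chunk_size);
  }
  assert(model_max_prefill_chunk_size == 2048);
}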

cpp/serve/config.h

Lines changed: 27 additions & 29 deletions
@@ -114,12 +114,8 @@ enum class SpeculativeMode : int {
   kSmallDraft = 1,
   /*! \brief The eagle-style speculative decoding. */
   kEagle = 2,
-};
-
-/*! \brief The kind of cache. */
-enum class KVStateKind : int {
-  kKVCache = 0,
-  kRNNState = 1,
+  /*! \brief The Medusa-style speculative decoding. */
+  kMedusa = 3,
 };
 
 class InferrableEngineConfig;
@@ -172,8 +168,6 @@ class EngineConfigNode : public Object {
   int prefill_chunk_size = 1024;
   /*! \brief The maximum history size for RNN state. KV cache does not need this. */
   int max_history_size = 0;
-  /*! \brief The kind of cache. Whether it's KV cache or RNN state. */
-  KVStateKind kv_state_kind = KVStateKind::kKVCache;
 
   /*************** Speculative decoding ***************/
 
@@ -216,7 +210,6 @@ struct InferrableEngineConfig {
   std::optional<int64_t> max_single_sequence_length;
   std::optional<int64_t> prefill_chunk_size;
   std::optional<int64_t> max_history_size;
-  std::optional<KVStateKind> kv_state_kind;
 
   /*! \brief Infer the config for KV cache from a given initial config. */
   TVM_DLL static Result<InferrableEngineConfig> InferForKVCache(
@@ -238,9 +231,16 @@ struct InferrableEngineConfig {
 Result<bool> ModelsUseKVCache(const std::vector<picojson::object>& model_configs);
 
 inline std::string EngineModeToString(EngineMode mode) {
-  return mode == EngineMode::kLocal ? "local"
-         : mode == EngineMode::kInteractive ? "interactive"
-         : "server";
+  if (mode == EngineMode::kLocal) {
+    return "local";
+  } else if (mode == EngineMode::kInteractive) {
+    return "interactive";
+  } else if (mode == EngineMode::kServer) {
+    return "server";
+  } else {
+    LOG(FATAL) << "Invalid engine mode: " << static_cast<int>(mode);
+    throw;
+  }
 }
 
 inline EngineMode EngineModeFromString(const std::string& mode) {
@@ -252,13 +252,22 @@ inline EngineMode EngineModeFromString(const std::string& mode) {
     return EngineMode::kServer;
   } else {
     LOG(FATAL) << "Invalid engine mode string: " << mode;
+    throw;
   }
 }
 
 inline std::string SpeculativeModeToString(SpeculativeMode speculative_mode) {
-  return speculative_mode == SpeculativeMode::kDisable ? "disable"
-         : speculative_mode == SpeculativeMode::kSmallDraft ? "small_draft"
-         : "eagle";
+  if (speculative_mode == SpeculativeMode::kDisable) {
+    return "disable";
+  } else if (speculative_mode == SpeculativeMode::kSmallDraft) {
+    return "small_draft";
+  } else if (speculative_mode == SpeculativeMode::kEagle) {
+    return "eagle";
+  } else if (speculative_mode == SpeculativeMode::kMedusa) {
+    return "medusa";
+  } else {
+    LOG(FATAL) << "Invalid speculative mode: " << static_cast<int>(speculative_mode);
+  }
 }
 
 inline SpeculativeMode SpeculativeModeFromString(const std::string& speculative_mode) {
@@ -268,22 +277,11 @@ inline SpeculativeMode SpeculativeModeFromString(const std::string& speculative_
     return SpeculativeMode::kSmallDraft;
   } else if (speculative_mode == "eagle") {
     return SpeculativeMode::kEagle;
+  } else if (speculative_mode == "medusa") {
+    return SpeculativeMode::kMedusa;
   } else {
    LOG(FATAL) << "Invalid speculative mode string: " << speculative_mode;
-  }
-}
-
-inline std::string KVStateKindToString(KVStateKind kv_state_kind) {
-  return kv_state_kind == KVStateKind::kKVCache ? "kv_cache" : "rnn_State";
-}
-
-inline KVStateKind KVStateKindFromString(const std::string& kv_state_kind) {
-  if (kv_state_kind == "kv_cache") {
-    return KVStateKind::kKVCache;
-  } else if (kv_state_kind == "rnn_state") {
-    return KVStateKind::kRNNState;
-  } else {
-    LOG(FATAL) << "Invalid kv state kind string: " << kv_state_kind;
+    throw;
   }
 }
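
Besides adding kMedusa, this file rewrites the ternary chains in EngineModeToString and SpeculativeModeToString as explicit if/else chains, so an out-of-range enum value now fails loudly through LOG(FATAL) instead of silently mapping to the last alternative; the bare throw; statements after LOG(FATAL) are presumably there to silence missing-return warnings on compilers that cannot see LOG(FATAL) never returns. The new mode name round-trips through the helpers used for engine-config JSON (usage sketch, assuming this header and the mlc::llm::serve namespace):

using namespace mlc::llm::serve;

SpeculativeMode mode = SpeculativeModeFromString("medusa");  // SpeculativeMode::kMedusa
std::string name = SpeculativeModeToString(mode);            // back to "medusa"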

cpp/serve/engine.cc

Lines changed: 15 additions & 24 deletions
@@ -105,8 +105,7 @@ class EngineImpl : public Engine {
       model->SetPrefillChunkSize(engine_config->prefill_chunk_size);
       model->CreateKVCache(engine_config->kv_cache_page_size, engine_config->max_num_sequence,
                            engine_config->max_total_sequence_length,
-                           engine_config->prefill_chunk_size, engine_config->max_history_size,
-                           engine_config->kv_state_kind);
+                           engine_config->prefill_chunk_size, engine_config->max_history_size);
       n->model_workspaces_.push_back(
           ModelWorkspace{model->AllocEmbeddingTensor(), model->AllocHiddenStatesTensor()});
     }
@@ -161,6 +160,18 @@ class EngineImpl : public Engine {
                                                 n->model_workspaces_, draft_token_workspace_manager,
                                                 engine_config, n->trace_recorder_)};
         break;
+      case SpeculativeMode::kMedusa:
+        n->actions_ = {EngineAction::EagleNewRequestPrefill(n->models_,            //
+                                                            logit_processor,       //
+                                                            sampler,               //
+                                                            n->model_workspaces_,  //
+                                                            draft_token_workspace_manager,  //
+                                                            engine_config,         //
+                                                            n->trace_recorder_),
+                       EngineAction::EagleBatchVerify(
+                           n->models_, logit_processor, sampler, n->model_workspaces_,
+                           draft_token_workspace_manager, engine_config, n->trace_recorder_)};
+        break;
       default:
         n->actions_ = {
             EngineAction::NewRequestPrefill(n->models_,  //
@@ -422,13 +433,9 @@ class EngineImpl : public Engine {
         json::LookupOptional<int64_t>(config, "max_history_size");
     std::optional<std::string> kv_state_kind_str =
         json::LookupOptional<std::string>(config, "kv_state_kind");
-    std::optional<KVStateKind> kv_state_kind;
-    if (kv_state_kind_str.has_value()) {
-      kv_state_kind = KVStateKindFromString(kv_state_kind_str.value());
-    }
-    InferrableEngineConfig inferrable_cfg{max_num_sequence, max_total_sequence_length,
+    InferrableEngineConfig inferrable_cfg{max_num_sequence, max_total_sequence_length,
                                           max_single_sequence_length, prefill_chunk_size,
-                                          max_history_size, kv_state_kind};
+                                          max_history_size};
 
     // - Get the model metadata.
     std::vector<ModelMetadata> model_metadata;
@@ -440,28 +447,13 @@ class EngineImpl : public Engine {
     if (use_kv_cache.IsErr()) {
       return TResult::Error(use_kv_cache.UnwrapErr());
     }
-    KVStateKind inferred_kv_state_kind;
     Result<InferrableEngineConfig> inferrable_cfg_res;
     if (use_kv_cache.Unwrap()) {
-      inferred_kv_state_kind = KVStateKind::kKVCache;
-      // - Check if the kv state kind from config is valid.
-      if (kv_state_kind.has_value() && kv_state_kind.value() != inferred_kv_state_kind) {
-        return TResult::Error(
-            "Invalid kv state kind in EngineConfig. The models use KV cache, but RNN state is "
-            "specified in EngineConfig.");
-      }
      // - Infer configuration.
       inferrable_cfg_res = InferrableEngineConfig::InferForKVCache(
           mode, device_, gpu_memory_utilization, model_configs, model_metadata, inferrable_cfg,
           verbose);
     } else {
-      inferred_kv_state_kind = KVStateKind::kRNNState;
-      // - Check if the kv state kind from config is valid.
-      if (kv_state_kind.has_value() && kv_state_kind.value() != inferred_kv_state_kind) {
-        return TResult::Error(
-            "Invalid kv state kind in EngineConfig. The models use RNN state, but KV cache is "
-            "specified in EngineConfig.");
-      }
       // - Infer configuration.
       inferrable_cfg_res = InferrableEngineConfig::InferForRNNState(
           mode, device_, gpu_memory_utilization, model_configs, model_metadata, inferrable_cfg,
@@ -477,7 +469,6 @@ class EngineImpl : public Engine {
     ICHECK(inferrable_cfg.max_single_sequence_length.has_value());
     ICHECK(inferrable_cfg.prefill_chunk_size.has_value());
     ICHECK(inferrable_cfg.max_history_size.has_value());
-    ICHECK(inferrable_cfg.kv_state_kind.has_value());
     return TResult::Ok(EngineConfig::FromJSONAndInferredConfig(config, inferrable_cfg));
   }
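
Two things stand out here. First, CreateKVCache no longer takes a kv_state_kind argument: each model now learns its cache kind from its own metadata (see cpp/metadata/model.cc above), so the engine-level redundancy checks could be deleted; note that kv_state_kind_str is still read from the config JSON but, in the lines shown, no longer feeds the inferred config. Second, the Medusa action list reuses the Eagle prefill and verify actions with no standalone draft action between them, which matches the Medusa design: the extra decoding heads propose draft tokens directly from the base model's hidden states, so no separate draft-model pass is needed. An illustrative stand-in for the dispatch, with plain strings in place of EngineAction objects (the Eagle and default entries include action names not shown in this hunk's context lines and are assumptions):

#include <string>
#include <vector>

enum class SpeculativeMode { kDisable, kSmallDraft, kEagle, kMedusa };

std::vector<std::string> ActionNames(SpeculativeMode mode) {
  switch (mode) {
    case SpeculativeMode::kEagle:
      // Assumed: Eagle interleaves a dedicated draft step.
      return {"EagleNewRequestPrefill", "EagleBatchDraft", "EagleBatchVerify"};
    case SpeculativeMode::kMedusa:
      // Same prefill/verify pair, but no separate draft action: the Medusa
      // heads emit the candidate tokens themselves.
      return {"EagleNewRequestPrefill", "EagleBatchVerify"};
    default:
      // Assumed non-speculative path.
      return {"NewRequestPrefill", "BatchDecode"};
  }
}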

cpp/serve/engine_actions/action_commons.cc

Lines changed: 20 additions & 0 deletions
@@ -211,6 +211,26 @@ RequestStateEntry PreemptLastRunningRequestStateEntry(
   return rsentry;
 }
 
+std::pair<NDArray, std::vector<SampleResult>> ApplyLogitProcessorAndSample(
+    const LogitProcessor& logit_processor, const Sampler& sampler, const NDArray& logits,
+    const Array<GenerationConfig>& generation_cfg, const Array<String>& request_ids,
+    const Array<RequestModelState>& mstates, const std::vector<RandomGenerator*>& rngs,
+    const std::vector<int>& sample_indices) {
+  // - Update logits.
+  logit_processor->InplaceUpdateLogits(logits, generation_cfg, mstates, request_ids);
+
+  // - Compute probability distributions.
+  NDArray probs_on_device =
+      logit_processor->ComputeProbsFromLogits(logits, generation_cfg, request_ids);
+
+  // - Sample tokens.
+  NDArray renormalized_probs = sampler->BatchRenormalizeProbsByTopP(probs_on_device, sample_indices,
+                                                                    request_ids, generation_cfg);
+  std::vector<SampleResult> sample_results = sampler->BatchSampleTokensWithProbAfterTopP(
+      renormalized_probs, sample_indices, request_ids, generation_cfg, rngs);
+  return {std::move(probs_on_device), std::move(sample_results)};
+}
+
 }  // namespace serve
 }  // namespace llm
 }  // namespace mlc
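
This helper folds the recurring serving-action pipeline (update logits in place, convert them to probability distributions, renormalize by top-p, then batch-sample) into one call. It returns the on-device probabilities alongside the sampled tokens, presumably so speculative verification can reuse the distribution without recomputing it from the logits.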

cpp/serve/engine_actions/action_commons.h

Lines changed: 18 additions & 0 deletions
@@ -75,6 +75,24 @@ inline std::vector<RequestStateEntry> GetRunningRequestStateEntries(const Engine
   return rsentries;
 }
 
+/*!
+ * \brief Apply the logit processor to the logits and sample one token for each request.
+ * \param logit_processor The logit processor to apply.
+ * \param sampler The sampler to sample tokens.
+ * \param logits The logits to process.
+ * \param generation_cfg The generation configurations of the requests.
+ * \param request_ids The request ids.
+ * \param mstates The model states of the requests.
+ * \param rngs The random generators of the requests.
+ * \param sample_indices The indices of the requests to sample.
+ * \return The processed logits and the sampled results.
+ */
+std::pair<NDArray, std::vector<SampleResult>> ApplyLogitProcessorAndSample(
+    const LogitProcessor& logit_processor, const Sampler& sampler, const NDArray& logits,
+    const Array<GenerationConfig>& generation_cfg, const Array<String>& request_ids,
+    const Array<RequestModelState>& mstates, const std::vector<RandomGenerator*>& rngs,
+    const std::vector<int>& sample_indices);
+
 }  // namespace serve
 }  // namespace llm
 }  // namespace mlc
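
A hypothetical call site for the helper declared above (sketch only: the local variable names are assumptions, not taken from this commit):

#include <numeric>  // std::iota

// One sample per request: sample_indices = {0, 1, ..., n - 1}.
std::vector<int> sample_indices(request_ids.size());
std::iota(sample_indices.begin(), sample_indices.end(), 0);

auto [probs_on_device, sample_results] = ApplyLogitProcessorAndSample(
    logit_processor, sampler, logits, generation_cfg, request_ids, mstates, rngs,
    sample_indices);
// probs_on_device keeps the full distribution for later reuse; sample_results
// holds one SampleResult per entry of sample_indices.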
