address reviewer comments

alec-flowers · alec-flowers · commit ee0dad72f21a · 2025-05-27T22:54:29.000-07:00
diff --git a/components/metrics/src/bin/mock_worker.rs b/components/metrics/src/bin/mock_worker.rs
@@ -115,6 +115,7 @@ fn mock_stats_handler(_stats: EndpointStats) -> serde_json::Value {
     let gpu_cache_usage_perc = rand::rng().random_range(0.0..=1.0);
     let gpu_prefix_cache_hit_rate = rand::rng().random_range(0.0..=1.0);
     let stats = ForwardPassMetrics {
+        data_parallel_rank: None, // Default for backwards compatibility
         request_active_slots,
         request_total_slots,
         kv_active_blocks,
diff --git a/launch/dynamo-run/src/subprocess/vllm_v1_inc.py b/launch/dynamo-run/src/subprocess/vllm_v1_inc.py
@@ -5,9 +5,7 @@
 # Can also be used standalone: `python3 vllm_inc.py` - lots of optional cmd line params
 
 # Setup checklist:
-# - We are in a virtualenv with vllm installed - and patched if using kv routing.
-# - `libdynamo_llm_capi.so` is in system lib path or it's containing folder is in LD_LIBRARY_PATH
-#   It builds in target/debug/ by default.
+# - We are in a virtualenv with vllm installed. vllm must be newer than v0.9.0 (currently pre-release).
 
 import argparse
 import asyncio
@@ -56,15 +54,17 @@ class Config:
     model_name: Optional[str]
     tensor_parallel_size: int
     kv_block_size: int
+    context_length: int
     extra_engine_args: str
 
 
 class DynamoStatLoggerPublisher(StatLoggerBase):
     """Stat logger publisher. Wrapper for the KvMetricsPublisher to match the StatLoggerBase interface."""
 
-    def __init__(self, component: Component) -> None:
+    def __init__(self, component: Component, dp_rank: int) -> None:
         self.inner = KvMetricsPublisher()
         self.inner.create_endpoint(component)
+        self.dp_rank = dp_rank
 
     def record(
         self, scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats]
@@ -79,7 +79,9 @@ def record(
                 / scheduler_stats.prefix_cache_stats.queries
             )
 
+        # TODO: Manage data-parallel (DP) ranks in metrics aggregation.
         self.inner.publish(
+            data_parallel_rank=self.dp_rank,
             request_active_slots=scheduler_stats.num_running_reqs,
             request_total_slots=0,  # TODO - remove from metrics
             kv_active_blocks=0,  # TODO - need to calculate this
@@ -99,12 +101,11 @@ class StatLoggerFactory:
     def __init__(self, component: Component) -> None:
         self.component = component
 
-    def create_stat_logger(self) -> StatLoggerBase:
-        return DynamoStatLoggerPublisher(self.component)
+    def create_stat_logger(self, dp_rank: int) -> StatLoggerBase:
+        return DynamoStatLoggerPublisher(self.component, dp_rank)
 
-    # TODO investigate if rank is imporant. Do I need to only do for rank 0?
-    def __call__(self, vllm_config: VllmConfig, rank: int) -> StatLoggerBase:
-        return self.create_stat_logger()
+    def __call__(self, vllm_config: VllmConfig, dp_rank: int) -> StatLoggerBase:
+        return self.create_stat_logger(dp_rank=dp_rank)
 
 
 class RequestHandler:
@@ -172,8 +173,13 @@ async def init(runtime: DistributedRuntime, config: Config):
     await component.create_service()
 
     endpoint = component.endpoint(config.endpoint)
+    print(f"BLOCK SIZE: {config.kv_block_size}")
     await register_llm(
-        ModelType.Backend, endpoint, config.model_path, config.model_name
+        ModelType.Backend,
+        endpoint,
+        config.model_path,
+        config.model_name,
+        kv_cache_block_size=config.kv_block_size,
     )
 
     arg_map = {
@@ -183,13 +189,20 @@ async def init(runtime: DistributedRuntime, config: Config):
         "skip_tokenizer_init": True,
         "disable_log_requests": True,
         "enable_prefix_caching": True,
-        "block_size": config.kv_block_size,
         # KV routing relies on logging KV metrics
         "disable_log_stats": False,
         "kv_events_config": KVEventsConfig(
             enable_kv_cache_events=True, publisher="zmq"
         ),
     }
+
+    if config.context_length:
+        # Only override when set; usually we want it to default to the model's max (from tokenizer_config.json)
+        arg_map["max_model_len"] = config.context_length
+
+    if config.kv_block_size > 0:
+        arg_map["block_size"] = config.kv_block_size
+
     if config.extra_engine_args != "":
         json_map = {}
         # extra_engine_args is a filename
@@ -271,6 +284,12 @@ def cmd_line_args():
     parser.add_argument(
         "--kv-block-size", type=int, default=16, help="Size of a KV cache block."
     )
+    parser.add_argument(
+        "--context-length",
+        type=int,
+        default=None,
+        help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
+    )
     parser.add_argument(
         "--extra-engine-args",
         type=str,
@@ -302,6 +321,7 @@ def cmd_line_args():
     config.endpoint = parsed_endpoint_name
     config.tensor_parallel_size = args.tensor_parallel_size
     config.kv_block_size = args.kv_block_size
+    config.context_length = args.context_length
     config.extra_engine_args = args.extra_engine_args
 
     return config
diff --git a/lib/bindings/python/rust/llm/kv.rs b/lib/bindings/python/rust/llm/kv.rs
@@ -97,6 +97,7 @@ impl KvMetricsPublisher {
     fn publish(
         &self,
         _py: Python,
+        data_parallel_rank: u32,
         request_active_slots: u64,
         request_total_slots: u64,
         kv_active_blocks: u64,
@@ -108,6 +109,7 @@ impl KvMetricsPublisher {
         self.inner
             .publish(
                 llm_rs::kv_router::protocols::ForwardPassMetrics {
+                    data_parallel_rank: Some(data_parallel_rank),
                     request_active_slots,
                     request_total_slots,
                     kv_active_blocks,
diff --git a/lib/bindings/python/src/dynamo/_core.pyi b/lib/bindings/python/src/dynamo/_core.pyi
@@ -364,10 +364,14 @@ class KvMetricsPublisher:
 
     def publish(
         self,
+        data_parallel_rank: int,
         request_active_slots: int,
         request_total_slots: int,
         kv_active_blocks: int,
         kv_total_blocks: int,
+        num_requests_waiting: int,
+        gpu_cache_usage_perc: float,
+        gpu_prefix_cache_hit_rate: float,
     ) -> None:
         """
         Update the KV metrics being reported.
@@ -637,7 +641,7 @@ class ModelType:
     """What type of request this model needs: Chat, Component or Backend (pre-processed)"""
     ...
 
-async def register_llm(model_type: ModelType, endpoint: Endpoint, model_path: str, model_name: Optional[str]) -> None:
+async def register_llm(model_type: ModelType, endpoint: Endpoint, model_path: str, model_name: Optional[str] = None, context_length: Optional[int] = None, kv_cache_block_size: Optional[int] = None) -> None:
     """Attach the model at path to the given endpoint, and advertise it as model_type"""
     ...
 
diff --git a/lib/llm/src/kv_router/protocols.rs b/lib/llm/src/kv_router/protocols.rs
@@ -41,6 +41,7 @@ pub struct WorkerSelectionResult {
 
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
 pub struct ForwardPassMetrics {
+    pub data_parallel_rank: Option<u32>, // None when no rank is reported; optional for backwards compatibility
     pub request_active_slots: u64,
     pub request_total_slots: u64,
     pub kv_active_blocks: u64,
diff --git a/lib/llm/src/kv_router/publisher.rs b/lib/llm/src/kv_router/publisher.rs
@@ -60,7 +60,7 @@ impl KvEventPublisher {
     }
 
     pub fn publish(&self, event: KvCacheEvent) -> Result<(), mpsc::error::SendError<KvCacheEvent>> {
-        tracing::info!("Publish event: {:?}", event);
+        tracing::trace!("Publish event: {:?}", event);
         self.tx.send(event)
     }
 
@@ -90,6 +90,7 @@ fn start_publish_task(
 
 pub struct KvEventPublisherFromZmq {
     kv_block_size: usize,
+    processor_handle: Option<tokio::task::JoinHandle<()>>,
     zmq_handle: Option<tokio::task::JoinHandle<()>>,
     zmq_token: Option<dynamo_runtime::CancellationToken>,
 }
@@ -98,6 +99,7 @@ impl KvEventPublisherFromZmq {
     pub fn new(kv_block_size: usize) -> Self {
         Self {
             kv_block_size,
+            processor_handle: None,
             zmq_handle: None,
             zmq_token: None,
         }
@@ -126,20 +128,23 @@ impl KvEventPublisherFromZmq {
                     zmq_endpoint,
                     zmq_topic,
                     raw_tx,
-                    zmq_token,
+                    zmq_token.clone(),
                 )),
         );
 
-        component
-            .drt()
-            .runtime()
-            .secondary()
-            .spawn(start_event_processor(
-                raw_rx,
-                component,
-                worker_id,
-                kv_block_size,
-            ));
+        self.processor_handle = Some(
+            component
+                .drt()
+                .runtime()
+                .secondary()
+                .spawn(start_event_processor(
+                    raw_rx,
+                    component,
+                    worker_id,
+                    kv_block_size,
+                    zmq_token,
+                ))
+        );
     }
 
     pub fn shutdown(&mut self) {
@@ -149,6 +154,9 @@ impl KvEventPublisherFromZmq {
         if let Some(handle) = self.zmq_handle.take() {
             handle.abort();
         }
+        if let Some(handle) = self.processor_handle.take() {
+            handle.abort();
+        }
     }
 }
 
@@ -157,24 +165,45 @@ async fn start_event_processor<P: EventPublisher>(
     component: P,
     worker_id: i64,
     kv_block_size: usize,
+    cancellation_token: dynamo_runtime::CancellationToken,
 ) {
-    while let Some((seq, payload)) = raw_rx.recv().await {
-        match rmps::from_slice::<KvEventBatch>(&payload) {
-            Ok(batch) => {
-                for raw_evt in batch.events.into_iter() {
-                    if let Some(event) = convert_event(raw_evt, seq, kv_block_size) {
-                        let router_event = RouterEvent::new(worker_id, event);
-                        if let Err(e) = component.publish(KV_EVENT_SUBJECT, &router_event).await {
-                            tracing::warn!("Failed to publish router event: {}", e);
+    loop {
+        tokio::select! {
+            // Check for cancellation
+            _ = cancellation_token.cancelled() => {
+                tracing::debug!("Event processor received cancellation signal");
+                break;
+            }
+
+            // Process incoming messages
+            msg = raw_rx.recv() => {
+                match msg {
+                    Some((seq, payload)) => {
+                        match rmps::from_slice::<KvEventBatch>(&payload) {
+                            Ok(batch) => {
+                                for raw_evt in batch.events.into_iter() {
+                                    if let Some(event) = convert_event(raw_evt, seq, kv_block_size) {
+                                        let router_event = RouterEvent::new(worker_id, event);
+                                        if let Err(e) = component.publish(KV_EVENT_SUBJECT, &router_event).await {
+                                            tracing::warn!(error=%e, "Failed to publish router event.");
+                                        }
+                                    }
+                                }
+                            }
+                            Err(e) => {
+                                tracing::warn!(error=%e, "Failed to decode KVEventBatch msgpack");
+                            }
                         }
                     }
+                    None => {
+                        tracing::debug!("Event processor channel closed");
+                        break;
+                    }
                 }
             }
-            Err(e) => {
-                tracing::warn!("Failed to decode KVEventBatch msgpack: {}", e);
-            }
         }
     }
+    tracing::debug!("Event processor exiting");
 }
 
 async fn start_zmq_listener(
@@ -183,7 +212,7 @@ async fn start_zmq_listener(
     raw_tx: mpsc::UnboundedSender<(u64, Vec<u8>)>,
     zmq_token: dynamo_runtime::CancellationToken,
 ) {
-    tracing::info!(
+    tracing::debug!(
         "KVEventPublisher connecting to ZMQ endpoint {} (topic '{}')",
         zmq_endpoint,
         zmq_topic
@@ -217,34 +246,34 @@ async fn start_zmq_listener(
                         // We expect multipart frames: [topic, seq, payload]
                         let mut frames: Vec<Vec<u8>> = msg.into_vec().into_iter().map(|frame| frame.to_vec()).collect();
 
-                        if frames.len() == 3 {
-                            let payload = frames.remove(2);
-                            let seq_bytes = frames.remove(1);
+                        if frames.len() != 3 {
+                            tracing::warn!(expected=3, actual=%frames.len(), "Received unexpected ZMQ frame count");
+                            continue;
+                        }
+                        let payload = frames.remove(2);
+                        let seq_bytes = frames.remove(1);
 
-                            if seq_bytes.len() != 8 {
-                                tracing::warn!("Invalid sequence number frame len={}", seq_bytes.len());
-                                continue;
-                            }
+                        if seq_bytes.len() != 8 {
+                            tracing::warn!(expected=8, actual=%seq_bytes.len(), "Invalid sequence number byte length");
+                            continue;
+                        }
 
-                            let seq = u64::from_be_bytes(seq_bytes.try_into().unwrap());
-                            if raw_tx.send((seq, payload)).is_err() {
-                                tracing::warn!("Failed to send message to channel - receiver dropped");
-                                break;
-                            }
-                        } else {
-                            tracing::warn!("Received unexpected ZMQ frame count: {}", frames.len());
+                        let seq = u64::from_be_bytes(seq_bytes.try_into().unwrap());
+                        if raw_tx.send((seq, payload)).is_err() {
+                            tracing::warn!("Failed to send message to channel - receiver dropped");
+                            break;
                         }
                     }
                     Err(e) => {
-                        tracing::warn!("Error reading from ZMQ socket: {}", e);
+                        tracing::warn!(error=%e, "Error reading from ZMQ socket");
                         // Brief sleep to avoid tight error loop
                         tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
                     }
                 }
             }
         }
     }
-    tracing::info!("ZMQ listener exiting");
+    tracing::debug!("ZMQ listener exiting");
 }
 
 /// Convert a raw event coming from the ZMQ channel into the internal
@@ -355,17 +384,14 @@ struct KvEventBatch {
 #[derive(Debug, Deserialize, Serialize)]
 #[serde(tag = "type")] // msgspec encodes variant tag as a string when `tag=True`
 enum RawKvEvent {
-    #[serde(rename = "BlockStored")]
     BlockStored {
         block_hashes: Vec<i64>,
         parent_block_hash: Option<i64>,
         token_ids: Vec<u32>,
         block_size: usize,
         lora_id: Option<u64>,
     },
-    #[serde(rename = "BlockRemoved")]
     BlockRemoved { block_hashes: Vec<i64> },
-    #[serde(rename = "AllBlocksCleared")]
     AllBlocksCleared,
 }
 
@@ -620,6 +646,8 @@ mod tests_startup_helpers {
         };
         let payload = rmps::to_vec(&batch).unwrap();
 
+        let token = dynamo_runtime::CancellationToken::new();
+
         // 2) channel feeding the processor
         let (tx, rx) = mpsc::unbounded_channel::<(u64, Vec<u8>)>();
         tx.send((123, payload.clone())).unwrap(); // seq = 123
@@ -629,7 +657,7 @@ mod tests_startup_helpers {
         let (comp, published) = MockComponent::new();
 
         // 4) run the function under test (let it consume exactly one msg)
-        let handle = tokio::spawn(start_event_processor(rx, comp, worker_id, kv_block_size));
+        let handle = tokio::spawn(start_event_processor(rx, comp, worker_id, kv_block_size, token));
 
         tokio::time::timeout(std::time::Duration::from_secs(1), handle)
             .await
diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs