@@ -39,20 +39,23 @@ impl Infer {
     pub(crate) fn new(
         client: ShardedClient,
         validation: Validation,
-        max_batch_size: usize,
+        waiting_served_ratio: f32,
+        max_batch_total_tokens: u32,
         max_waiting_tokens: usize,
         max_concurrent_requests: usize,
+        requires_padding: bool,
     ) -> Self {
         // Infer shared state
-        let queue = Queue::new();
+        let queue = Queue::new(requires_padding);
         let shared = Arc::new(Shared {
             batching_task: Notify::new(),
         });
 
         // Spawn batching background task that contains all the inference logic
         tokio::spawn(batching_task(
             client,
-            max_batch_size,
+            waiting_served_ratio,
+            max_batch_total_tokens,
             max_waiting_tokens,
             queue.clone(),
             shared.clone(),
@@ -232,18 +235,12 @@ impl Infer {
 /// Batches requests and sends them to the inference server
 async fn batching_task(
     mut client: ShardedClient,
-    max_batch_size: usize,
+    waiting_served_ratio: f32,
+    max_batch_total_tokens: u32,
     max_waiting_tokens: usize,
     queue: Queue,
     shared: Arc<Shared>,
 ) {
-    // Minimum batch size after which we try to add more requests
-    let limit_min_batch_size = if max_batch_size > 1 {
-        (max_batch_size / 2) as u32
-    } else {
-        0
-    };
-
     // Infinite loop
     loop {
         // Wait for a notification from the Infer struct
@@ -252,7 +249,9 @@ async fn batching_task(
         // Get the next batch from the queue
         // This batch might be smaller than the maximum batch size if there are not enough requests
         // waiting in the queue
-        while let Some((mut entries, batch, span)) = queue.next_batch(None, max_batch_size).await {
+        while let Some((mut entries, batch, span)) =
+            queue.next_batch(None, max_batch_total_tokens).await
+        {
             let mut cached_batch = prefill(&mut client, batch, &mut entries)
                 .instrument(span)
                 .await;
@@ -263,48 +262,50 @@ async fn batching_task(
             while let Some(batch) = cached_batch {
                 // Get current batch info
                 let batch_size = batch.size;
+                let batch_max_tokens = batch.max_tokens;
                 let mut batches = vec![batch];
                 metrics::gauge!("tgi_batch_current_size", batch_size as f64);
 
-                // If the current batch is too small, we try to add more requests to it
-                if batch_size <= limit_min_batch_size {
-                    let min_size = match waiting_tokens {
-                        // If we didn't onboard any new requests since >= max_waiting_tokens, we try
-                        // to add a new batch even though its size might be small
-                        _ if waiting_tokens >= max_waiting_tokens => None,
-                        // Minimum size criteria
-                        _ => Some(limit_min_batch_size as usize),
-                    };
-
-                    // Try to get a new batch
-                    if let Some((mut new_entries, new_batch, span)) = queue
-                        .next_batch(min_size, max_batch_size - batch_size as usize)
-                        .await
-                    {
-                        entries.iter_mut().for_each(|(_, entry)| {
-                            // Create a new span to add the info that this entry is waiting
-                            // because a new batch is being computed
-                            let entry_waiting_span = info_span!(parent: &entry.span, "waiting");
-                            // Add relationships
-                            span.follows_from(&entry_waiting_span);
-                            entry_waiting_span.follows_from(&span);
-                            // Update entry
-                            entry.temp_span = Some(entry_waiting_span);
-                        });
-
-                        // Generate one token for this new batch to have the attention past in cache
-                        let new_cached_batch = prefill(&mut client, new_batch, &mut new_entries)
-                            .instrument(span)
-                            .await;
-                        // Reset waiting counter
-                        waiting_tokens = 1;
-                        // Extend current batch with the new batch
-                        if let Some(new_cached_batch) = new_cached_batch {
-                            entries.extend(new_entries);
-                            batches.push(new_cached_batch);
-                        }
+                let min_size = match waiting_tokens {
+                    // If we didn't onboard any new requests since >= max_waiting_tokens, we try
+                    // to add a new batch even though its size might be small
+                    _ if waiting_tokens >= max_waiting_tokens => None,
+                    // Minimum size criteria
+                    _ => Some((batch_size as f32 * waiting_served_ratio).floor() as usize),
+                };
+
+                let token_budget = max_batch_total_tokens - batch_max_tokens;
+
+                // Try to get a new batch
+                if let Some((mut new_entries, new_batch, span)) =
+                    queue.next_batch(min_size, token_budget).await
+                {
+                    // Tracking metrics
+
+                    entries.iter_mut().for_each(|(_, entry)| {
+                        // Create a new span to add the info that this entry is waiting
+                        // because a new batch is being computed
+                        let entry_waiting_span = info_span!(parent: &entry.span, "waiting");
+                        // Add relationships
+                        span.follows_from(&entry_waiting_span);
+                        entry_waiting_span.follows_from(&span);
+                        // Update entry
+                        entry.temp_span = Some(entry_waiting_span);
+                    });
+
+                    // Generate one token for this new batch to have the attention past in cache
+                    let new_cached_batch = prefill(&mut client, new_batch, &mut new_entries)
+                        .instrument(span)
+                        .await;
+                    // Reset waiting counter
+                    waiting_tokens = 1;
+                    // Extend current batch with the new batch
+                    if let Some(new_cached_batch) = new_cached_batch {
+                        entries.extend(new_entries);
+                        batches.push(new_cached_batch);
                     }
                 }
+
                 // Create span for this batch to add context to inference calls
                 let next_batch_size = entries.len();
                 let next_batch_span =
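
Note: the hunk above drops the old `limit_min_batch_size` heuristic (top up only when the running batch held no more than half of `max_batch_size` requests) and sizes concatenation by tokens instead. Below is a minimal, self-contained sketch of that decision, under the assumption that it can be isolated as a pure function; `concat_policy` is not a function in the router, the values in `main` are illustrative, and `saturating_sub` is used defensively where the code above subtracts directly.

```rust
/// Simplified sketch of the new concatenation policy: decide the minimum size
/// and token budget to pass to `queue.next_batch` for the next top-up attempt.
fn concat_policy(
    batch_size: u32,           // requests currently decoding
    batch_max_tokens: u32,     // tokens already reserved by the running batch
    waiting_tokens: usize,     // decode steps since a batch was last added
    waiting_served_ratio: f32, // how large a waiting batch must be, relative to the running one
    max_batch_total_tokens: u32,
    max_waiting_tokens: usize,
) -> (Option<usize>, u32) {
    // Waited long enough: accept a new batch of any size.
    // Otherwise require at least `waiting_served_ratio` times the running batch.
    let min_size = if waiting_tokens >= max_waiting_tokens {
        None
    } else {
        Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
    };
    // Tokens left once the running batch is accounted for.
    let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
    (min_size, token_budget)
}

fn main() {
    // 8 running requests holding 6_000 tokens, 3 decode steps since the last concat.
    let (min_size, token_budget) = concat_policy(8, 6_000, 3, 1.2, 32_000, 20);
    assert_eq!(min_size, Some(9)); // floor(8 * 1.2)
    assert_eq!(token_budget, 26_000);
    println!("min_size={min_size:?}, token_budget={token_budget}");
}
```

Budgeting by tokens rather than request count lets the router pack many short requests or a few long ones under the same memory ceiling, which is what `max_batch_total_tokens` caps.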
@@ -341,22 +342,11 @@ async fn prefill(
 
     match client.prefill(batch).await {
         Ok((generations, next_batch)) => {
+            // Send generated tokens and filter stopped entries
             filter_send_generations(generations, entries);
 
             // Filter next batch and remove requests that were stopped
-            let next_batch = match next_batch {
-                None => None,
-                Some(batch) => {
-                    let id = batch.id;
-                    let next_batch = filter_batch(batch, entries);
-                    // Next batch is now empty
-                    // Clear it from the Python shards cache
-                    if next_batch.is_none() {
-                        let _ = client.clear_cache(Some(id)).await;
-                    }
-                    next_batch
-                }
-            };
+            let next_batch = filter_batch(client, next_batch, entries).await;
 
             metrics::histogram!("tgi_batch_inference_duration", start_time.elapsed().as_secs_f64(), "method" => "prefill");
             metrics::increment_counter!("tgi_batch_inference_success", "method" => "prefill");
@@ -384,22 +374,11 @@ async fn decode(
 
     match client.decode(batches).await {
         Ok((generations, next_batch)) => {
+            // Send generated tokens and filter stopped entries
             filter_send_generations(generations, entries);
 
             // Filter next batch and remove requests that were stopped
-            let next_batch = match next_batch {
-                None => None,
-                Some(batch) => {
-                    let id = batch.id;
-                    let next_batch = filter_batch(batch, entries);
-                    // Next batch is now empty
-                    // Clear it from the Python shards cache
-                    if next_batch.is_none() {
-                        let _ = client.clear_cache(Some(id)).await;
-                    }
-                    next_batch
-                }
-            };
+            let next_batch = filter_batch(client, next_batch, entries).await;
 
             metrics::histogram!("tgi_batch_inference_duration", start_time.elapsed().as_secs_f64(), "method" => "decode");
             metrics::increment_counter!("tgi_batch_inference_success", "method" => "decode");
@@ -419,14 +398,35 @@ async fn decode(
 
 /// Filter a `batch` and remove all requests not present in `entries`
 #[instrument(skip_all)]
-fn filter_batch(mut batch: Batch, entries: &IntMap<u64, Entry>) -> Option<Batch> {
+async fn filter_batch(
+    client: &mut ShardedClient,
+    next_batch: Option<Batch>,
+    entries: &IntMap<u64, Entry>,
+) -> Option<Batch> {
+    let mut batch = next_batch?;
+
+    // No need to filter
+    if batch.size as usize == entries.len() {
+        return Some(batch);
+    }
+
+    let id = batch.id;
+
+    // Retain only requests that are still in entries
     batch.requests.retain(|r| entries.contains_key(&r.id));
-    let size = batch.requests.len();
-    if size == 0 {
-        return None;
+
+    if batch.requests.is_empty() {
+        // All requests have been filtered out
+        // Next batch is now empty
+        // Clear it from the Python shards cache
+        // We unwrap here as we need to panic since we cannot recover if this method fails
+        client.clear_cache(Some(id)).await.unwrap();
+        None
+    } else {
+        // Filter Python shard cache
+        // We unwrap here as we need to panic since we cannot recover if this method fails
+        client.filter_batch(id, batch.requests).await.unwrap()
     }
-    batch.size = size as u32;
-    Some(batch)
 }
 
 /// Send one or multiple `InferStreamResponse` to Infer for all `entries`
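
Note: the rewritten `filter_batch` keeps the Python shards' cache in sync with the router's view: an untouched batch skips the RPC, a partially finished batch is filtered shard-side, and a fully finished batch is cleared. Below is a standalone sketch of that three-way decision; the real code issues `ShardedClient` RPCs, whereas here they are only modeled as a returned `ShardAction`, and `plan_filter` is not part of the router. The size-equality fast path relies on the router's invariant that `entries` only ever holds requests from the batch.

```rust
use std::collections::HashMap;

/// What the router would ask the shards to do for one cached batch.
#[derive(Debug, PartialEq)]
enum ShardAction {
    Keep,                  // every request is still running: no RPC needed
    FilterBatch(Vec<u64>), // keep only these request ids in the shard cache
    ClearCache(u64),       // nothing left: drop the whole batch from the cache
}

fn plan_filter(batch_id: u64, batch_requests: &[u64], entries: &HashMap<u64, ()>) -> ShardAction {
    // Fast path: sizes match, so nothing was stopped
    // (entries is always a subset of the batch's requests).
    if batch_requests.len() == entries.len() {
        return ShardAction::Keep;
    }
    // Retain only requests that are still in entries.
    let kept: Vec<u64> = batch_requests
        .iter()
        .copied()
        .filter(|id| entries.contains_key(id))
        .collect();
    if kept.is_empty() {
        ShardAction::ClearCache(batch_id)
    } else {
        ShardAction::FilterBatch(kept)
    }
}

fn main() {
    // Requests 1 and 3 are still generating; 2, 4 and 5 have stopped.
    let entries: HashMap<u64, ()> = HashMap::from([(1, ()), (3, ())]);
    assert_eq!(plan_filter(7, &[1, 2, 3], &entries), ShardAction::FilterBatch(vec![1, 3]));
    assert_eq!(plan_filter(7, &[2, 4, 5], &entries), ShardAction::ClearCache(7));
    assert_eq!(plan_filter(7, &[1, 3], &entries), ShardAction::Keep);
}
```

In the diff above, the `FilterBatch` and `ClearCache` cases map to `client.filter_batch(id, batch.requests)` and `client.clear_cache(Some(id))`, both of which panic on failure because the router cannot recover from a desynchronized shard cache.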