fix: Integration tests fixes (#2161)

keivenchang · web-flow · commit f10e44cae9f4 · 2025-07-31T15:07:06.000-07:00
Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
diff --git a/lib/runtime/src/component/component.rs b/lib/runtime/src/component/component.rs
@@ -86,27 +86,17 @@ mod tests {
     // todo - make a distributed runtime fixture
     // todo - two options - fully mocked or integration test
     #[tokio::test]
-    async fn test_publish() {
+    async fn test_publish_and_subscribe() {
         let rt = Runtime::from_current().unwrap();
         let dtr = DistributedRuntime::from_settings(rt.clone()).await.unwrap();
-        let ns = dtr.namespace("test".to_string()).unwrap();
-        let cp = ns.component("component".to_string()).unwrap();
-        cp.publish("test", &"test".to_string()).await.unwrap();
-        rt.shutdown();
-    }
-
-    #[tokio::test]
-    async fn test_subscribe() {
-        let rt = Runtime::from_current().unwrap();
-        let dtr = DistributedRuntime::from_settings(rt.clone()).await.unwrap();
-        let ns = dtr.namespace("test".to_string()).unwrap();
-        let cp = ns.component("component".to_string()).unwrap();
+        let ns = dtr.namespace("test_component".to_string()).unwrap();
+        let cp = ns.component("test_component".to_string()).unwrap();
 
-        // Create a subscriber
-        let mut subscriber = ns.subscribe("test").await.unwrap();
+        // Create a subscriber on the component
+        let mut subscriber = cp.subscribe("test_event").await.unwrap();
 
-        // Publish a message
-        cp.publish("test", &"test_message".to_string())
+        // Publish a message from the component
+        cp.publish("test_event", &"test_message".to_string())
             .await
             .unwrap();
 
diff --git a/lib/runtime/src/component/namespace.rs b/lib/runtime/src/component/namespace.rs
@@ -99,22 +99,24 @@ mod tests {
     async fn test_publish() {
         let rt = Runtime::from_current().unwrap();
         let dtr = DistributedRuntime::from_settings(rt.clone()).await.unwrap();
-        let ns = dtr.namespace("test".to_string()).unwrap();
-        ns.publish("test", &"test".to_string()).await.unwrap();
+        let ns = dtr.namespace("test_namespace_publish".to_string()).unwrap();
+        ns.publish("test_event", &"test".to_string()).await.unwrap();
         rt.shutdown();
     }
 
     #[tokio::test]
     async fn test_subscribe() {
         let rt = Runtime::from_current().unwrap();
         let dtr = DistributedRuntime::from_settings(rt.clone()).await.unwrap();
-        let ns = dtr.namespace("test".to_string()).unwrap();
+        let ns = dtr
+            .namespace("test_namespace_subscribe".to_string())
+            .unwrap();
 
         // Create a subscriber
-        let mut subscriber = ns.subscribe("test").await.unwrap();
+        let mut subscriber = ns.subscribe("test_event").await.unwrap();
 
         // Publish a message
-        ns.publish("test", &"test_message".to_string())
+        ns.publish("test_event", &"test_message".to_string())
             .await
             .unwrap();
 
diff --git a/lib/runtime/src/http_server.rs b/lib/runtime/src/http_server.rs
@@ -77,7 +77,7 @@ impl crate::traits::DistributedRuntimeProvider for HttpMetricsRegistry {
 
 impl MetricsRegistry for HttpMetricsRegistry {
     fn basename(&self) -> String {
-        "http_server".to_string()
+        "dynamo".to_string()
     }
 
     fn parent_hierarchy(&self) -> Vec<String> {
@@ -100,7 +100,7 @@ impl HttpServerState {
         // Note: This metric is created at the DRT level (no namespace), so we manually add "dynamo_" prefix
         // to maintain consistency with the project's metric naming convention
         let uptime_gauge = http_metrics_registry.as_ref().create_gauge(
-            "dynamo_uptime_seconds",
+            "system_uptime_seconds",
             "Total uptime of the DistributedRuntime in seconds",
             &[],
         )?;
@@ -368,9 +368,9 @@ mod tests {
         println!("Full metrics response:\n{}", response);
 
         let expected = "\
-# HELP dynamo_uptime_seconds Total uptime of the DistributedRuntime in seconds
-# TYPE dynamo_uptime_seconds gauge
-dynamo_uptime_seconds{namespace=\"http_server\"} 42
+# HELP dynamo_system_uptime_seconds Total uptime of the DistributedRuntime in seconds
+# TYPE dynamo_system_uptime_seconds gauge
+dynamo_system_uptime_seconds{namespace=\"dynamo\"} 42
 ";
         assert_eq!(response, expected);
     }
diff --git a/lib/runtime/src/metrics.rs b/lib/runtime/src/metrics.rs
@@ -797,7 +797,7 @@ mod test_prefixes {
         println!("\n=== Testing Invalid Namespace Behavior ===");
 
         // Create a namespace with invalid name (contains hyphen)
-        let invalid_namespace = drt.namespace("test-namespace").unwrap();
+        let invalid_namespace = drt.namespace("@@123").unwrap();
 
         // Debug: Let's see what the hierarchy looks like
         println!(
@@ -810,15 +810,15 @@ mod test_prefixes {
         );
         println!("Invalid namespace prefix: '{}'", invalid_namespace.prefix());
 
-        // Try to create a metric - this should fail because the namespace name will be used in the metric name
+        // Try to create a metric - this should fail because "@@123" gets stripped to "" which is invalid
         let result = invalid_namespace.create_counter("test_counter", "A test counter", &[]);
-        println!("Result with invalid namespace 'test-namespace':");
+        println!("Result with invalid namespace '@@123':");
         println!("{:?}", result);
 
-        // The result should be an error from Prometheus
+        // The result should be an error because empty metric names are invalid
         assert!(
             result.is_err(),
-            "Creating metric with invalid namespace should fail"
+            "Creating metric with namespace '@@123' should fail because it gets stripped to empty string"
         );
 
         // For comparison, show a valid namespace works
@@ -926,15 +926,15 @@ testnamespace_testgauge{{component="testcomponent",namespace="testnamespace"}} 5
         println!("{}", namespace_output);
 
         let expected_namespace_output = format!(
-            r#"# HELP testintcounter A test int counter
-# TYPE testintcounter counter
-testintcounter{{namespace="testnamespace"}} 12345
-# HELP testnamespace_testcounter A test counter
+            r#"# HELP testnamespace_testcounter A test counter
 # TYPE testnamespace_testcounter counter
 testnamespace_testcounter{{component="testcomponent",endpoint="testendpoint",namespace="testnamespace"}} 123.456789
 # HELP testnamespace_testgauge A test gauge
 # TYPE testnamespace_testgauge gauge
 testnamespace_testgauge{{component="testcomponent",namespace="testnamespace"}} 50000
+# HELP testnamespace_testintcounter A test int counter
+# TYPE testnamespace_testintcounter counter
+testnamespace_testintcounter{{namespace="testnamespace"}} 12345
 "#
         );
 
@@ -1015,9 +1015,6 @@ testhistogram_bucket{{le="10"}} 3
 testhistogram_bucket{{le="+Inf"}} 3
 testhistogram_sum 7.5
 testhistogram_count 3
-# HELP testintcounter A test int counter
-# TYPE testintcounter counter
-testintcounter{{namespace="testnamespace"}} 12345
 # HELP testintgauge A test int gauge
 # TYPE testintgauge gauge
 testintgauge 42
@@ -1031,6 +1028,9 @@ testnamespace_testcounter{{component="testcomponent",endpoint="testendpoint",nam
 # HELP testnamespace_testgauge A test gauge
 # TYPE testnamespace_testgauge gauge
 testnamespace_testgauge{{component="testcomponent",namespace="testnamespace"}} 50000
+# HELP testnamespace_testintcounter A test int counter
+# TYPE testnamespace_testintcounter counter
+testnamespace_testintcounter{{namespace="testnamespace"}} 12345
 "#
         );
 
diff --git a/lib/runtime/tests/soak.rs b/lib/runtime/tests/soak.rs
@@ -13,6 +13,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+//! Soak test for the runtime; run with `cargo test --test soak integration::main --features integration`.
+//!
+//! It will send a batch of requests to the runtime and measure the throughput.
+//!
+//! It will also measure the latency of the requests.
+//!
+//! A reasonable soak test configuration to start off is 1 minute duration with 10000 batch load:
+//! export DYN_QUEUED_UP_PROCESSING=true
+//! export DYN_SOAK_BATCH_LOAD=10000
+//! export DYN_SOAK_RUN_DURATION=60s
+//! cargo test --test soak integration::main --features integration -- --nocapture
 #[cfg(feature = "integration")]
 mod integration {
 
@@ -22,13 +33,17 @@ mod integration {
         logging,
         pipeline::{
             async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut,
-            ResponseStream, SingleIn,
+            PushRouter, ResponseStream, SingleIn,
         },
         protocols::annotated::Annotated,
-        DistributedRuntime, ErrorContext, Result, Runtime, Worker,
+        stream, DistributedRuntime, ErrorContext, Result, Runtime, Worker,
     };
     use futures::StreamExt;
-    use std::{sync::Arc, time::Duration};
+    use std::{
+        sync::atomic::{AtomicU64, Ordering},
+        sync::Arc,
+        time::Duration,
+    };
     use tokio::time::Instant;
 
     #[test]
@@ -45,16 +60,29 @@ mod integration {
 
         client.await??;
         distributed.shutdown();
-        server.await??;
+        let handler = server.await??;
+
+        // Print final backend counter value
+        let final_count = handler.backend_counter.load(Ordering::Relaxed);
+        println!(
+            "Final RequestHandler backend_counter: {} requests processed",
+            final_count
+        );
 
         Ok(())
     }
 
-    struct RequestHandler {}
+    struct RequestHandler {
+        backend_counter: AtomicU64,
+        queued_up_processing: bool,
+    }
 
     impl RequestHandler {
-        fn new() -> Arc<Self> {
-            Arc::new(Self {})
+        fn new(queued_up_processing: bool) -> Arc<Self> {
+            Arc::new(Self {
+                backend_counter: AtomicU64::new(0),
+                queued_up_processing,
+            })
         }
     }
 
@@ -63,25 +91,40 @@ mod integration {
         async fn generate(&self, input: SingleIn<String>) -> Result<ManyOut<Annotated<String>>> {
             let (data, ctx) = input.into_parts();
 
+            // Increment backend counter
+            self.backend_counter.fetch_add(1, Ordering::Relaxed);
+
             let chars = data
                 .chars()
                 .map(|c| Annotated::from_data(c.to_string()))
                 .collect::<Vec<_>>();
 
-            let stream = async_stream::stream! {
-                for c in chars {
-                    yield c;
-                    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
-                }
-            };
-
-            Ok(ResponseStream::new(Box::pin(stream), ctx.context()))
+            if self.queued_up_processing {
+                // queued up processing - delayed response to saturate the queue
+                let async_stream = async_stream::stream! {
+                    for c in chars {
+                        yield c;
+                        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+                    }
+                };
+                Ok(ResponseStream::new(Box::pin(async_stream), ctx.context()))
+            } else {
+                // normal processing - immediate response
+                let iter_stream = stream::iter(chars);
+                Ok(ResponseStream::new(Box::pin(iter_stream), ctx.context()))
+            }
         }
     }
 
-    async fn backend(runtime: DistributedRuntime) -> Result<()> {
+    async fn backend(runtime: DistributedRuntime) -> Result<Arc<RequestHandler>> {
+        // read the queued-up processing flag from env; unset or unparsable means false (immediate responses)
+        let queued_up_processing =
+            std::env::var("DYN_QUEUED_UP_PROCESSING").unwrap_or("false".to_string());
+        let queued_up_processing: bool = queued_up_processing.parse().unwrap_or(false);
+
         // attach an ingress to an engine
-        let ingress = Ingress::for_engine(RequestHandler::new())?;
+        let handler = RequestHandler::new(queued_up_processing);
+        let ingress = Ingress::for_engine(handler.clone())?;
 
         // // make the ingress discoverable via a component service
         // // we must first create a service, then we can attach one more more endpoints
@@ -95,39 +138,44 @@ mod integration {
             .endpoint_builder()
             .handler(ingress)
             .start()
-            .await
+            .await?;
+
+        Ok(handler)
     }
 
     async fn client(runtime: DistributedRuntime) -> Result<()> {
         // get the run duration from env
-        let run_duration = std::env::var("DYN_SOAK_RUN_DURATION").unwrap_or("1m".to_string());
+        let run_duration = std::env::var("DYN_SOAK_RUN_DURATION").unwrap_or("3s".to_string());
         let run_duration =
-            humantime::parse_duration(&run_duration).unwrap_or(Duration::from_secs(60));
+            humantime::parse_duration(&run_duration).unwrap_or(Duration::from_secs(3));
 
-        let batch_load = std::env::var("DYN_SOAK_BATCH_LOAD").unwrap_or("10000".to_string());
-        let batch_load: usize = batch_load.parse().unwrap_or(10000);
+        let batch_load = std::env::var("DYN_SOAK_BATCH_LOAD").unwrap_or("100".to_string());
+        let batch_load: usize = batch_load.parse().unwrap_or(100);
 
         let client = runtime
             .namespace(DEFAULT_NAMESPACE)?
             .component("backend")?
             .endpoint("generate")
-            .client::<String, Annotated<String>>()
+            .client()
             .await?;
 
         client.wait_for_instances().await?;
-        let client = Arc::new(client);
+        let router =
+            PushRouter::<String, Annotated<String>>::from_client(client, Default::default())
+                .await?;
+        let router = Arc::new(router);
 
         let start = Instant::now();
         let mut count = 0;
 
         loop {
             let mut tasks = Vec::new();
             for _ in 0..batch_load {
-                let client = client.clone();
+                let router = router.clone();
                 tasks.push(tokio::spawn(async move {
                     let mut stream = tokio::time::timeout(
-                        Duration::from_secs(30),
-                        client.random("hello world".to_string().into()),
+                        Duration::from_secs(5),
+                        router.random("hello world".to_string().into()),
                     )
                     .await
                     .context("request timed out")??;
@@ -147,7 +195,9 @@ mod integration {
 
             let elapsed = start.elapsed();
             count += batch_load;
-            println!("elapsed: {:?}; count: {}", elapsed, count);
+            if count % 1000 == 0 {
+                println!("elapsed: {:?}; count: {}", elapsed, count);
+            }
 
             if elapsed > run_duration {
                 println!("done");