feat(tests): e2e

AlonKellner-RedHat · AlonKellner-RedHat · commit be46a0933089 · 2025-08-13T15:48:46.000Z
diff --git a/tests/e2e/e2e/__init__.py b/tests/e2e/e2e/__init__.py
diff --git a/tests/e2e/e2e/e2e/README.md b/tests/e2e/e2e/e2e/README.md
@@ -0,0 +1,7 @@
+# E2E tests
+
+The E2E tests in GuideLLM use the [vLLM simulator by llm-d](https://llm-d.ai/docs/architecture/Components/inf-simulator). Before running them, build the simulator binary with the following command:
+
+```shell
+docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./
+```
diff --git a/tests/e2e/e2e/e2e/__init__.py b/tests/e2e/e2e/e2e/__init__.py
diff --git a/tests/e2e/e2e/e2e/test_max_error_benchmark.py b/tests/e2e/e2e/e2e/test_max_error_benchmark.py
@@ -0,0 +1,92 @@
+# test_max_error_benchmark.py
+
+import json
+import subprocess
+import time
+from pathlib import Path
+
+import pytest
+from loguru import logger
+
+from tests.e2e.vllm_sim_server import VllmSimServer
+
+
+@pytest.fixture(scope="module")
+def server():
+    """
+    Pytest fixture to start and stop the server for the entire module
+    using the TestServer class.
+    """
+    server = VllmSimServer(port=8000, model="databricks/dolly-v2-12b", mode="echo")
+    try:
+        server.start()
+        yield server  # Yield the URL for tests to use
+    finally:
+        server.stop()  # Teardown: Stop the server after tests are done
+
+
+@pytest.mark.timeout(30)
+def test_max_error_benchmark(server: VllmSimServer):
+    """
+    Another example test interacting with the server.
+    """
+    report_path = Path("tests/e2e/max_error_benchmarks.json")
+    rate = 10
+    max_error_rate = 0.1
+    command = f"""guidellm benchmark \
+  --target "{server.get_url()}" \
+  --rate-type constant \
+  --rate {rate} \
+  --max-seconds 60 \
+  --max-error {max_error_rate} \
+  --data "prompt_tokens=256,output_tokens=128" \
+  --output-path {report_path}
+              """
+    logger.info(f"Client command: {command}")
+    process = subprocess.Popen(  # noqa: S603
+        ["/bin/bash", "-c", command],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+    )
+    logger.info("Waiting for client to start...")
+    time.sleep(10)
+    server.stop()
+
+    try:
+        logger.info("Fetching client output")
+        stdout, stderr = process.communicate()
+        logger.debug(f"Client stdout:\n{stdout}")
+        logger.debug(f"Client stderr:\n{stderr}")
+
+        assert report_path.exists()
+        with report_path.open("r") as f:
+            report = json.load(f)
+
+        assert "benchmarks" in report
+        benchmarks = report["benchmarks"]
+        assert len(benchmarks) > 0
+        benchmark = benchmarks[0]
+        assert "run_stats" in benchmark
+        run_stats = benchmark["run_stats"]
+        assert "status" in run_stats
+        status = run_stats["status"]
+        assert status == "error"
+        assert "termination_reason" in run_stats
+        termination_reason = run_stats["termination_reason"]
+        assert termination_reason == "max_error_reached"
+        assert "window_error_rate" in run_stats
+        window_error_rate = run_stats["window_error_rate"]
+        assert window_error_rate > max_error_rate
+    finally:
+        process.terminate()  # Send SIGTERM
+        try:
+            process.wait(timeout=5)  # Wait for the process to terminate
+            logger.info("Client stopped successfully.")
+        except subprocess.TimeoutExpired:
+            logger.warning("Client did not terminate gracefully, killing it...")
+            process.kill()  # Send SIGKILL if it doesn't terminate
+            process.wait()
+
+    if report_path.exists():
+        report_path.unlink()
diff --git a/tests/e2e/e2e/e2e/test_placeholder.py b/tests/e2e/e2e/e2e/test_placeholder.py
@@ -0,0 +1,6 @@
+import pytest
+
+
+@pytest.mark.smoke
+def test_placeholder():
+    assert True
diff --git a/tests/e2e/e2e/e2e/test_successful_benchmark.py b/tests/e2e/e2e/e2e/test_successful_benchmark.py
@@ -0,0 +1,118 @@
+# test_successful_benchmark.py
+
+import json
+import os
+from pathlib import Path
+
+import pytest
+from loguru import logger
+
+from tests.e2e.vllm_sim_server import VllmSimServer
+
+
+@pytest.fixture(scope="module")
+def server():
+    """
+    Pytest fixture to start and stop the server for the entire module
+    using the TestServer class.
+    """
+    server = VllmSimServer(port=8000, model="databricks/dolly-v2-12b", mode="echo")
+    try:
+        server.start()
+        yield server  # Yield the URL for tests to use
+    finally:
+        server.stop()  # Teardown: Stop the server after tests are done
+
+
+@pytest.mark.timeout(30)
+def test_max_seconds_benchmark(server: VllmSimServer):
+    """
+    Another example test interacting with the server.
+    """
+    report_path = Path("tests/e2e/max_duration_benchmarks.json")
+    rate = 10
+    command = f"""
+guidellm benchmark \
+  --target "{server.get_url()}" \
+  --rate-type constant \
+  --rate {rate} \
+  --max-seconds 1 \
+  --data "prompt_tokens=256,output_tokens=128" \
+  --output-path {report_path}
+              """
+
+    logger.info(f"Client command: {command}")
+    os.system(command)  # noqa: S605
+
+    assert report_path.exists()
+    with report_path.open("r") as f:
+        report = json.load(f)
+
+    assert "benchmarks" in report
+    benchmarks = report["benchmarks"]
+    assert len(benchmarks) > 0
+    benchmark = benchmarks[0]
+    assert "requests" in benchmark
+    requests = benchmark["requests"]
+    assert "successful" in requests
+    successful = requests["successful"]
+    assert len(successful) > rate
+
+    assert "run_stats" in benchmark
+    run_stats = benchmark["run_stats"]
+    assert "status" in run_stats
+    status = run_stats["status"]
+    assert status == "success"
+    assert "termination_reason" in run_stats
+    termination_reason = run_stats["termination_reason"]
+    assert termination_reason == "max_seconds_reached"
+
+    if report_path.exists():
+        report_path.unlink()
+
+
+@pytest.mark.timeout(30)
+def test_max_requests_benchmark(server: VllmSimServer):
+    """
+    Another example test interacting with the server.
+    """
+    report_path = Path("tests/e2e/max_number_benchmarks.json")
+    rate = 10
+    command = f"""
+guidellm benchmark \
+  --target "{server.get_url()}" \
+  --rate-type constant \
+  --rate {rate} \
+  --max-requests {rate} \
+  --data "prompt_tokens=256,output_tokens=128" \
+  --output-path {report_path}
+              """
+
+    logger.info(f"Client command: {command}")
+    os.system(command)  # noqa: S605
+
+    assert report_path.exists()
+    with report_path.open("r") as f:
+        report = json.load(f)
+
+    assert "benchmarks" in report
+    benchmarks = report["benchmarks"]
+    assert len(benchmarks) > 0
+    benchmark = benchmarks[0]
+    assert "requests" in benchmark
+    requests = benchmark["requests"]
+    assert "successful" in requests
+    successful = requests["successful"]
+    assert len(successful) == rate
+
+    assert "run_stats" in benchmark
+    run_stats = benchmark["run_stats"]
+    assert "status" in run_stats
+    status = run_stats["status"]
+    assert status == "success"
+    assert "termination_reason" in run_stats
+    termination_reason = run_stats["termination_reason"]
+    assert termination_reason == "max_requests_reached"
+
+    if report_path.exists():
+        report_path.unlink()
diff --git a/tests/e2e/e2e/e2e/vllm-sim.Dockerfile b/tests/e2e/e2e/e2e/vllm-sim.Dockerfile
@@ -0,0 +1,14 @@
+# Build stage: compile the llm-d inference simulator from source.
+# The base image is referenced without a tag.
+FROM golang AS base
+
+WORKDIR /app
+
+# libzmq3-dev and pkg-config are installed before the build (presumably build
+# dependencies of the simulator - confirm against its Makefile). The repository
+# is cloned without selecting a branch or tag, so the built revision is not
+# pinned.
+RUN apt-get update && \
+    apt-get install -y libzmq3-dev pkg-config && \
+    git clone https://github.com/llm-d/llm-d-inference-sim.git && \
+    cd llm-d-inference-sim && \
+    make build
+
+# NOTE(review): this is the last instruction of the build stage; nothing after
+# it in this stage uses the working directory.
+WORKDIR /app/llm-d-inference-sim
+
+# Export stage: an empty image holding only the simulator binary under /bin,
+# so that `docker build ... -o type=local,dest=./` writes it to
+# ./bin/llm-d-inference-sim.
+FROM scratch
+COPY --from=base /app/llm-d-inference-sim/bin/llm-d-inference-sim /bin/llm-d-inference-sim
diff --git a/tests/e2e/e2e/e2e/vllm_sim_server.py b/tests/e2e/e2e/e2e/vllm_sim_server.py
@@ -0,0 +1,138 @@
+import subprocess
+import time
+from pathlib import Path
+from typing import Optional
+
+import pytest
+import requests
+from loguru import logger
+
+
+class VllmSimServer:
+    """
+    [vLLM simulator](https://llm-d.ai/docs/architecture/Components/inf-simulator)
+    A vLLM simulator wrapper for pytest.
+    """
+
+    def __init__(
+        self,
+        port: int,
+        model: str,
+        lora: Optional[list[str]] = None,
+        mode: Optional[str] = None,
+        echo: Optional[bool] = None,
+        random: Optional[bool] = None,
+        time_to_first_token: Optional[float] = None,
+        inter_token_latency: Optional[float] = None,
+        max_loras: Optional[int] = None,
+        max_cpu_loras: Optional[int] = None,
+        max_running_requests: Optional[int] = None,
+    ):
+        self.port = port
+        self.model = model
+        self.lora = lora
+        self.mode = mode
+        self.echo = echo
+        self.random = random
+        self.time_to_first_token = time_to_first_token
+        self.inter_token_latency = inter_token_latency
+        self.max_loras = max_loras
+        self.max_cpu_loras = max_cpu_loras
+        self.max_running_requests = max_running_requests
+        self.server_url = f"http://127.0.0.1:{self.port}"
+        self.health_url = f"{self.server_url}/health"
+        self.app_script = "./bin/llm-d-inference-sim"
+        self.process: Optional[subprocess.Popen] = None
+        if not Path(self.app_script).exists():
+            message = (
+                "The vLLM simulator binary is required for E2E tests, but is missing.\n"
+                "To build it and enable E2E tests, please run:\n"
+                "docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./"
+            )
+            logger.warning(message)
+            pytest.skip("vLLM simlator binary missing", allow_module_level=True)
+
+    def get_cli_parameters(self) -> list[str]:
+        parameters = ["--port", f"{self.port}", "--model", self.model]
+        if self.lora is not None:
+            parameters.extend(["--lora", ",".join(self.lora)])
+        if self.mode is not None:
+            parameters.extend(["--mode", self.mode])
+        if self.echo is not None:
+            parameters.extend(["--echo"])
+        if self.random is not None:
+            parameters.extend(["--random"])
+        if self.time_to_first_token is not None:
+            parameters.extend(["--time-to-first-token", f"{self.time_to_first_token}"])
+        if self.inter_token_latency is not None:
+            parameters.extend(["--inter-token-latency", f"{self.inter_token_latency}"])
+        if self.max_loras is not None:
+            parameters.extend(["--max-loras", f"{self.max_loras}"])
+        if self.max_cpu_loras is not None:
+            parameters.extend(["--max-cpu-loras", f"{self.max_cpu_loras}"])
+        if self.max_running_requests is not None:
+            parameters.extend(
+                ["--max-running-requests", f"{self.max_running_requests}"]
+            )
+        return parameters
+
+    def start(self):
+        """
+        Starts the server process and waits for it to become healthy.
+        """
+
+        logger.info(f"Starting server on {self.server_url} using {self.app_script}...")
+        cli_parameters = self.get_cli_parameters()
+        command = " ".join([self.app_script, *cli_parameters])
+        logger.info(f"Server command: {command}")
+        self.process = subprocess.Popen(  # noqa: S603
+            [self.app_script, *cli_parameters],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,  # Decode stdout/stderr as text
+        )
+
+        # Wait for the server to start and become healthy
+        max_retries = 20
+        retry_delay_sec = 0.5
+        for i in range(max_retries):
+            try:
+                response = requests.get(self.health_url, timeout=1)
+                if response.status_code == 200:
+                    logger.info(f"Server started successfully at {self.server_url}")
+                    return
+                else:
+                    logger.warning(f"Got response with status: {response.status_code}")
+                    logger.warning(response.json())
+            except requests.ConnectionError:
+                logger.warning(f"Waiting for server... (attempt {i + 1}/{max_retries})")
+                time.sleep(retry_delay_sec)
+        # If the loop completes without breaking, the server didn't start
+        stdout, stderr = self.process.communicate()
+        logger.error(f"Server failed to start after {max_retries} retries.")
+        logger.error(f"Server stdout:\n{stdout}")
+        logger.error(f"Server stderr:\n{stderr}")
+        self.stop()  # Attempt to clean up
+        pytest.fail("Server did not start within the expected time.")
+
+    def stop(self):
+        """
+        Stops the server process.
+        """
+        if self.process:
+            logger.info(f"Stopping server on {self.server_url}...")
+            self.process.terminate()  # Send SIGTERM
+            try:
+                self.process.wait(timeout=1)  # Wait for the process to terminate
+                logger.info("Server stopped successfully.")
+            except subprocess.TimeoutExpired:
+                logger.warning("Server did not terminate gracefully, killing it...")
+                self.process.kill()  # Send SIGKILL if it doesn't terminate
+                self.process.wait()
+            self.process = None  # Clear the process reference
+
+    def get_url(self):
+        """
+        Returns the base URL of the running server.
+        """
+        return self.server_url
diff --git a/tests/e2e/e2e/test_placeholder.py b/tests/e2e/e2e/test_placeholder.py
@@ -0,0 +1,6 @@
+import pytest
+
+
+@pytest.mark.smoke
+def test_placeholder():
+    assert True