32 changes: 10 additions & 22 deletions examples/vllm/README.md → components/backends/vllm/README.md
@@ -1,23 +1,11 @@
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# LLM Deployment Examples using vLLM
# LLM Deployment using vLLM

This directory contains examples and reference implementations for deploying Large Language Models (LLMs) in various configurations using vLLM. For Dynamo integration, we leverage vLLM's native KV cache events, NIXL based transfer mechanisms, and metric reporting to enable KV-aware routing and P/D disaggregation.
This directory contains the Dynamo vLLM engine and reference implementations for deploying Large Language Models (LLMs) in various configurations using vLLM. For Dynamo integration, we leverage vLLM's native KV cache events, NIXL-based transfer mechanisms, and metric reporting to enable KV-aware routing and prefill/decode (P/D) disaggregation.

## Deployment Architectures

@@ -36,11 +24,11 @@ docker compose -f deploy/metrics/docker-compose.yml up -d
### Build and Run docker

```bash
./container/build.sh
./container/build.sh --framework VLLM
```

```bash
./container/run.sh -it [--mount-workspace]
./container/run.sh -it --framework VLLM [--mount-workspace]
```

The container build includes the specific commit [vllm-project/vllm#19790](https://github.com/vllm-project/vllm/pull/19790), which enables external control of the data-parallel (DP) ranks.
@@ -74,31 +62,31 @@ Note: The above architecture illustrates all the components. The final component

```bash
# requires one gpu
cd examples/vllm
cd components/backends/vllm
bash launch/agg.sh
```
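
Once the worker is up, you can smoke-test the deployment through the frontend; the same request works for the other launch variants below. A minimal sketch, assuming the frontend is listening on port 8000; adjust the port to whatever `--http-port` the frontend was started with (the Kubernetes examples in this repo use 8080):

```bash
# Hypothetical smoke test: the port (8000) is an assumption, match it to the
# frontend's --http-port if you set one.
curl localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen3-0.6B",
    "messages": [{"role": "user", "content": "Say hello in one short sentence."}],
    "stream": false,
    "max_tokens": 30
  }'
```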

#### Aggregated Serving with KV Routing

```bash
# requires two gpus
cd examples/vllm
cd components/backends/vllm
bash launch/agg_router.sh
```
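
One rough way to see the KV-aware router at work (a sketch, not an official tool): send an identical prompt a few times and watch the two workers' logs; requests that share a cached prefix should tend to land on the worker that already holds those KV blocks. Port 8000 is again an assumption.

```bash
# Hypothetical check of KV-aware routing: repeat the same prompt and observe in
# the worker logs which instance keeps serving it. Adjust the port as needed.
for i in 1 2 3; do
  curl -s localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "Qwen/Qwen3-0.6B",
         "messages": [{"role": "user", "content": "Repeat the word dynamo."}],
         "max_tokens": 8}' > /dev/null
done
```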

#### Disaggregated Serving

```bash
# requires two gpus
cd examples/vllm
cd components/backends/vllm
bash launch/disagg.sh
```

#### Disaggregated Serving with KV Routing

```bash
# requires three gpus
cd examples/vllm
cd components/backends/vllm
bash launch/disagg_router.sh
```

@@ -108,7 +96,7 @@ This example is not meant to be performant but showcases dynamo routing to data

```bash
# requires four gpus
cd examples/vllm
cd components/backends/vllm
bash launch/dep.sh
```
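
Each worker in `launch/dep.sh` is pinned to a single GPU via `CUDA_VISIBLE_DEVICES` and started with its own `--data-parallel-rank` (with `--data-parallel-size 4`), relying on the external DP-rank control noted above so that Dynamo can route requests to individual DP ranks.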

@@ -146,7 +134,7 @@ For Kubernetes deployment, YAML manifests are provided in the `deploy/` director
Example with disagg:

```bash
cd ~/dynamo/examples/vllm/deploy
cd ~/dynamo/components/backends/vllm/deploy
kubectl apply -f disagg.yaml
```
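
After applying a manifest, it is worth confirming that the pods come up and the frontend responds before sending real traffic. The sketch below is hypothetical: the CRD plural, service name, and port are assumptions, so substitute the names your cluster actually exposes (check `kubectl get all` in the target namespace).

```bash
# Hypothetical verification; resource and service names below are assumptions.
kubectl get dynamographdeployments   # assumed plural for the DynamoGraphDeployment CRD
kubectl get pods                     # workers can take a while to pull the image

# Port-forward to the frontend service (placeholder name/port) and probe it.
kubectl port-forward svc/<frontend-service> 8000:8000 &
curl localhost:8000/v1/models
```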

@@ -1,18 +1,6 @@
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Running Deepseek R1 with Wide EP
@@ -51,4 +39,4 @@ curl localhost:8080/v1/chat/completions \
"stream": false,
"max_tokens": 30
}'
```
```
@@ -1,17 +1,6 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
@@ -50,7 +39,7 @@ spec:
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm
workingDir: /workspace/components/backends/vllm
args:
- dynamo
- run
@@ -94,6 +83,6 @@ spec:
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm
workingDir: /workspace/components/backends/vllm
args:
- "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
@@ -1,17 +1,6 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
@@ -50,7 +39,7 @@ spec:
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm
workingDir: /workspace/components/backends/vllm
args:
- dynamo
- run
@@ -96,6 +85,6 @@ spec:
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm
workingDir: /workspace/components/backends/vllm
args:
- "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
@@ -1,17 +1,6 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
@@ -50,7 +39,7 @@ spec:
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm
workingDir: /workspace/components/backends/vllm
args:
- dynamo
- run
@@ -94,7 +83,7 @@ spec:
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm
workingDir: /workspace/components/backends/vllm
args:
- "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
VllmPrefillWorker:
@@ -133,6 +122,6 @@ spec:
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm
workingDir: /workspace/components/backends/vllm
args:
- "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
@@ -1,17 +1,6 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
@@ -50,7 +39,7 @@ spec:
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm
workingDir: /workspace/components/backends/vllm
args:
- dynamo
- run
@@ -94,7 +83,7 @@ spec:
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm
workingDir: /workspace/components/backends/vllm
args:
- "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
VllmPrefillWorker:
@@ -133,6 +122,6 @@ spec:
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm
workingDir: /workspace/components/backends/vllm
args:
- "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
@@ -1,17 +1,6 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
@@ -50,16 +39,9 @@ spec:
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm
workingDir: /workspace/components/backends/vllm
args:
- dynamo
- run
- in=http
- out=dyn
- --http-port
- "8000"
- --router-mode
- kv
- "python3 -m dynamo.frontend --http-port 8080 --router-mode kv"
VllmDecodeWorker:
dynamoNamespace: vllm-v1-disagg-router
envFromSecret: hf-token-secret
@@ -96,9 +78,9 @@ spec:
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm
workingDir: /workspace/components/backends/vllm
args:
- "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
VllmPrefillWorker:
dynamoNamespace: vllm-v1-disagg-router
envFromSecret: hf-token-secret
@@ -135,6 +117,6 @@ spec:
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm
workingDir: /workspace/components/backends/vllm
args:
- "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
@@ -5,7 +5,7 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT

# run ingress
dynamo run in=http out=dyn &
python -m dynamo.frontend &

# run worker
python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager --no-enable-prefix-caching
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --no-enable-prefix-caching
13 changes: 13 additions & 0 deletions components/backends/vllm/launch/agg_router.sh
@@ -0,0 +1,13 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT

# run ingress
python -m dynamo.frontend --router-mode kv &

# run workers
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager &

CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager
@@ -5,13 +5,13 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT

# run ingress
dynamo run in=http out=dyn --router-mode kv &
python -m dynamo.frontend --router-mode kv &

# Data Parallel Attention / Expert Parallelism
# Routing to DP workers managed by Dynamo
# Chose Qwen3-30B because it's a small MoE model that can fit on smaller GPUs (an L40S, for example)
for i in {0..3}; do
CUDA_VISIBLE_DEVICES=$i python3 components/main.py \
CUDA_VISIBLE_DEVICES=$i python3 -m dynamo.vllm \
--model Qwen/Qwen3-30B-A3B \
--data-parallel-rank $i \
--data-parallel-size 4 \
@@ -5,11 +5,11 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT

# run ingress
dynamo run in=http out=dyn &
python -m dynamo.frontend --router-mode kv &

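# run decode worker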
CUDA_VISIBLE_DEVICES=0 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager &
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager &

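# run prefill worker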
CUDA_VISIBLE_DEVICES=1 python3 components/main.py \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--enforce-eager \
--is-prefill-worker