Skip to content

Commit fc12436

Browse files
authored
feat(frontend): router-mode settings (#2001)
1 parent dc75cf1 commit fc12436

File tree

11 files changed

+145
-34
lines changed

11 files changed

+145
-34
lines changed

components/frontend/src/dynamo/frontend/main.py

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,15 @@
1414

1515
import uvloop
1616

17-
from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input
17+
from dynamo.llm import (
18+
EngineType,
19+
EntrypointArgs,
20+
KvRouterConfig,
21+
RouterConfig,
22+
RouterMode,
23+
make_engine,
24+
run_input,
25+
)
1826
from dynamo.runtime import DistributedRuntime
1927

2028

@@ -32,19 +40,68 @@ def parse_args():
3240
parser.add_argument(
3341
"--http-port", type=int, default=8080, help="HTTP port for the engine (u16)."
3442
)
43+
parser.add_argument(
44+
"--router-mode",
45+
type=str,
46+
choices=["round-robin", "random", "kv"],
47+
default="round-robin",
48+
help="How to route the request",
49+
)
50+
parser.add_argument(
51+
"--kv-overlap-score-weight",
52+
type=float,
53+
default=1.0,
54+
help="KV Router: Weight for overlap score in worker selection. Higher values prioritize KV cache reuse.",
55+
)
56+
parser.add_argument(
57+
"--router-temperature",
58+
type=float,
59+
default=0.0,
60+
help="KV Router: Temperature for worker sampling via softmax. Higher values promote more randomness, and 0 fallbacks to deterministic.",
61+
)
62+
parser.add_argument(
63+
"--kv-events",
64+
action="store_true",
65+
dest="use_kv_events",
66+
help=" KV Router: Whether to use KV events to maintain the view of cached blocks. If false, would use ApproxKvRouter for predicting block creation / deletion based only on incoming requests at a timer.",
67+
)
68+
parser.add_argument(
69+
"--no-kv-events",
70+
action="store_false",
71+
dest="use_kv_events",
72+
help=" KV Router. Disable KV events.",
73+
)
74+
parser.set_defaults(use_kv_events=True)
75+
3576
return parser.parse_args()
3677

3778

3879
async def async_main():
3980
runtime = DistributedRuntime(asyncio.get_running_loop(), False)
4081
flags = parse_args()
4182

83+
if flags.router_mode == "kv":
84+
router_mode = RouterMode.KV
85+
kv_router_config = KvRouterConfig(
86+
overlap_score_weight=flags.kv_overlap_score_weight,
87+
router_temperature=flags.router_temperature,
88+
use_kv_events=flags.use_kv_events,
89+
)
90+
elif flags.router_mode == "random":
91+
router_mode = RouterMode.Random
92+
kv_router_config = None
93+
else:
94+
router_mode = RouterMode.RoundRobin
95+
kv_router_config = None
96+
97+
kwargs = {
98+
"http_port": flags.http_port,
99+
"kv_cache_block_size": flags.kv_cache_block_size,
100+
"router_config": RouterConfig(router_mode, kv_router_config),
101+
}
102+
42103
# out=dyn
43-
e = EntrypointArgs(
44-
EngineType.Dynamic,
45-
http_port=flags.http_port,
46-
kv_cache_block_size=flags.kv_cache_block_size,
47-
)
104+
e = EntrypointArgs(EngineType.Dynamic, **kwargs)
48105
engine = await make_engine(runtime, e)
49106

50107
try:

launch/dynamo-run/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ pub async fn run(
4444
// Only set if user provides. Usually loaded from tokenizer_config.json
4545
.context_length(flags.context_length)
4646
.http_port(Some(flags.http_port))
47-
.router_config(flags.router_config())
47+
.router_config(Some(flags.router_config()))
4848
.request_template(flags.request_template.clone());
4949

5050
// If `in=dyn` we want the trtllm/sglang/vllm subprocess to listen on that endpoint.

lib/bindings/python/rust/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
7878
m.add_class::<llm::entrypoint::EntrypointArgs>()?;
7979
m.add_class::<llm::entrypoint::EngineConfig>()?;
8080
m.add_class::<llm::entrypoint::EngineType>()?;
81+
m.add_class::<llm::entrypoint::RouterConfig>()?;
82+
m.add_class::<llm::entrypoint::KvRouterConfig>()?;
8183
m.add_class::<llm::kv::WorkerMetricsPublisher>()?;
8284
m.add_class::<llm::model_card::ModelDeploymentCard>()?;
8385
m.add_class::<llm::preprocessor::OAIChatPreprocessor>()?;
@@ -160,7 +162,7 @@ fn register_llm<'p>(
160162
.model_name(model_name)
161163
.context_length(context_length)
162164
.kv_cache_block_size(kv_cache_block_size)
163-
.router_config(router_config);
165+
.router_config(Some(router_config));
164166
// Download from HF, load the ModelDeploymentCard
165167
let mut local_model = builder.build().await.map_err(to_pyerr)?;
166168
// Advertise ourself on etcd so ingress can find us

lib/bindings/python/rust/llm/entrypoint.rs

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,14 @@ use pyo3::{exceptions::PyException, prelude::*};
88

99
use dynamo_llm::entrypoint::input::Input;
1010
use dynamo_llm::entrypoint::EngineConfig as RsEngineConfig;
11+
use dynamo_llm::entrypoint::RouterConfig as RsRouterConfig;
12+
use dynamo_llm::kv_router::KvRouterConfig as RsKvRouterConfig;
1113
use dynamo_llm::local_model::{LocalModel, LocalModelBuilder};
1214
use dynamo_llm::mocker::protocols::MockEngineArgs;
1315
use dynamo_runtime::protocols::Endpoint as EndpointId;
1416

17+
use crate::RouterMode;
18+
1519
#[pyclass(eq, eq_int)]
1620
#[derive(Clone, Debug, PartialEq)]
1721
#[repr(i32)]
@@ -21,6 +25,56 @@ pub enum EngineType {
2125
Mocker = 3,
2226
}
2327

28+
#[pyclass]
29+
#[derive(Default, Clone, Debug, Copy)]
30+
pub struct KvRouterConfig {
31+
inner: RsKvRouterConfig,
32+
}
33+
34+
#[pymethods]
35+
impl KvRouterConfig {
36+
#[new]
37+
#[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true))]
38+
fn new(overlap_score_weight: f64, router_temperature: f64, use_kv_events: bool) -> Self {
39+
KvRouterConfig {
40+
inner: RsKvRouterConfig {
41+
overlap_score_weight,
42+
router_temperature,
43+
use_kv_events,
44+
..Default::default()
45+
},
46+
}
47+
}
48+
}
49+
50+
#[pyclass]
51+
#[derive(Clone, Debug)]
52+
pub struct RouterConfig {
53+
router_mode: RouterMode,
54+
kv_router_config: KvRouterConfig,
55+
}
56+
57+
#[pymethods]
58+
impl RouterConfig {
59+
#[new]
60+
#[pyo3(signature = (mode, config=None))]
61+
pub fn new(mode: RouterMode, config: Option<KvRouterConfig>) -> Self {
62+
Self {
63+
router_mode: mode,
64+
kv_router_config: config.unwrap_or_default(),
65+
}
66+
}
67+
}
68+
69+
impl From<RouterConfig> for RsRouterConfig {
70+
fn from(rc: RouterConfig) -> RsRouterConfig {
71+
RsRouterConfig {
72+
router_mode: rc.router_mode.into(),
73+
kv_router_config: rc.kv_router_config.inner,
74+
}
75+
}
76+
}
77+
2478
#[pyclass]
2579
#[derive(Clone, Debug)]
2680
pub(crate) struct EntrypointArgs {
@@ -31,7 +85,7 @@ pub(crate) struct EntrypointArgs {
3185
endpoint_id: Option<EndpointId>,
3286
context_length: Option<u32>,
3387
template_file: Option<PathBuf>,
34-
//router_config: Option<RouterConfig>,
88+
router_config: Option<RouterConfig>,
3589
kv_cache_block_size: Option<u32>,
3690
http_port: Option<u16>,
3791
extra_engine_args: Option<PathBuf>,
@@ -41,7 +95,7 @@ pub(crate) struct EntrypointArgs {
4195
impl EntrypointArgs {
4296
#[allow(clippy::too_many_arguments)]
4397
#[new]
44-
#[pyo3(signature = (engine_type, model_path=None, model_name=None, model_config=None, endpoint_id=None, context_length=None, template_file=None, kv_cache_block_size=None, http_port=None, extra_engine_args=None))]
98+
#[pyo3(signature = (engine_type, model_path=None, model_name=None, model_config=None, endpoint_id=None, context_length=None, template_file=None, router_config=None, kv_cache_block_size=None, http_port=None, extra_engine_args=None))]
4599
pub fn new(
46100
engine_type: EngineType,
47101
model_path: Option<PathBuf>,
@@ -50,7 +104,7 @@ impl EntrypointArgs {
50104
endpoint_id: Option<String>,
51105
context_length: Option<u32>,
52106
template_file: Option<PathBuf>,
53-
//router_config: Option<RouterConfig>,
107+
router_config: Option<RouterConfig>,
54108
kv_cache_block_size: Option<u32>,
55109
http_port: Option<u16>,
56110
extra_engine_args: Option<PathBuf>,
@@ -71,7 +125,7 @@ impl EntrypointArgs {
71125
endpoint_id: endpoint_id_obj,
72126
context_length,
73127
template_file,
74-
//router_config,
128+
router_config,
75129
kv_cache_block_size,
76130
http_port,
77131
extra_engine_args,
@@ -101,6 +155,7 @@ pub fn make_engine<'p>(
101155
.context_length(args.context_length)
102156
.request_template(args.template_file.clone())
103157
.kv_cache_block_size(args.kv_cache_block_size)
158+
.router_config(args.router_config.clone().map(|rc| rc.into()))
104159
.http_port(args.http_port);
105160
pyo3_async_runtimes::tokio::future_into_py(py, async move {
106161
let local_model = builder.build().await.map_err(to_pyerr)?;

lib/bindings/python/src/dynamo/_core.pyi

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,5 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
3-
#
4-
# Licensed under the Apache License, Version 2.0 (the "License");
5-
# you may not use this file except in compliance with the License.
6-
# You may obtain a copy of the License at
7-
#
8-
# http://www.apache.org/licenses/LICENSE-2.0
9-
#
10-
# Unless required by applicable law or agreed to in writing, software
11-
# distributed under the License is distributed on an "AS IS" BASIS,
12-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13-
# See the License for the specific language governing permissions and
14-
# limitations under the License.
153

164
from typing import (
175
Any,
@@ -831,9 +819,15 @@ class ModelType:
831819

832820
class RouterMode:
    """Router mode for load balancing requests across workers"""
    # Members are declared so that type checkers resolve RouterMode.RoundRobin,
    # RouterMode.Random and RouterMode.KV, which the frontend entry point uses.
    RoundRobin: 'RouterMode'
    Random: 'RouterMode'
    KV: 'RouterMode'
823+
824+
class RouterConfig:
    """How to route the request"""

    def __init__(self, mode: 'RouterMode', config: Optional['KvRouterConfig'] = None) -> None:
        """
        Create a router configuration.

        Args:
            mode: The router mode to use.
            config: KV router settings; when omitted or None the binding
                falls back to the default KvRouterConfig.
        """
        ...
827+
828+
class KvRouterConfig:
    """Values for KV router"""

    def __init__(self, overlap_score_weight: float = 1.0, router_temperature: float = 0.0, use_kv_events: bool = True) -> None:
        """
        Create the KV router settings.

        Args:
            overlap_score_weight: Weight for the overlap score in worker selection.
            router_temperature: Temperature for worker sampling via softmax.
            use_kv_events: Whether to use KV events to maintain the view of cached blocks.
        """
        ...
837831

838832
async def register_llm(model_type: ModelType, endpoint: Endpoint, model_path: str, model_name: Optional[str] = None, context_length: Optional[int] = None, kv_cache_block_size: Optional[int] = None, router_mode: Optional[RouterMode] = None) -> None:
839833
"""Attach the model at path to the given endpoint, and advertise it as model_type"""

lib/bindings/python/src/dynamo/llm/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,13 @@
2424
from dynamo._core import KvIndexer as KvIndexer
2525
from dynamo._core import KvMetricsAggregator as KvMetricsAggregator
2626
from dynamo._core import KvRecorder as KvRecorder
27+
from dynamo._core import KvRouterConfig as KvRouterConfig
2728
from dynamo._core import KvStats as KvStats
2829
from dynamo._core import ModelType as ModelType
2930
from dynamo._core import OverlapScores as OverlapScores
3031
from dynamo._core import RadixTree as RadixTree
32+
from dynamo._core import RouterConfig as RouterConfig
33+
from dynamo._core import RouterMode as RouterMode
3134
from dynamo._core import SpecDecodeStats as SpecDecodeStats
3235
from dynamo._core import WorkerMetricsPublisher as WorkerMetricsPublisher
3336
from dynamo._core import WorkerStats as WorkerStats

lib/llm/src/discovery/model_manager.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ impl ModelManager {
212212
kv_cache_block_size: u32,
213213
kv_router_config: Option<KvRouterConfig>,
214214
) -> anyhow::Result<Arc<KvRouter>> {
215-
let selector = Box::new(DefaultWorkerSelector::new(kv_router_config.clone()));
215+
let selector = Box::new(DefaultWorkerSelector::new(kv_router_config));
216216
let chooser = KvRouter::new(
217217
component.clone(),
218218
kv_cache_block_size,

lib/llm/src/discovery/watcher.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ impl ModelWatcher {
220220
&model_entry.name,
221221
&component,
222222
card.kv_cache_block_size,
223-
self.kv_router_config.clone(),
223+
self.kv_router_config,
224224
)
225225
.await?;
226226
let kv_push_router = KvPushRouter::new(router, chooser);
@@ -261,7 +261,7 @@ impl ModelWatcher {
261261
&model_entry.name,
262262
&component,
263263
card.kv_cache_block_size,
264-
self.kv_router_config.clone(),
264+
self.kv_router_config,
265265
)
266266
.await?;
267267
let kv_push_router = KvPushRouter::new(router, chooser);

lib/llm/src/entrypoint/input/http.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ pub async fn run(runtime: Runtime, engine_config: EngineConfig) -> anyhow::Resul
4242
etcd_client.clone(),
4343
MODEL_ROOT_PATH,
4444
router_config.router_mode,
45-
Some(router_config.kv_router_config.clone()),
45+
Some(router_config.kv_router_config),
4646
)
4747
.await?;
4848
}

lib/llm/src/kv_router.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ pub trait WorkerSelector {
6262
}
6363

6464
/// KV Router configuration parameters
65-
#[derive(Debug, Clone)]
65+
#[derive(Debug, Clone, Copy)]
6666
pub struct KvRouterConfig {
6767
pub overlap_score_weight: f64,
6868

0 commit comments

Comments
 (0)