diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
index fb0153f5ff8..151b33b1195 100644
--- a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
@@ -57,8 +57,8 @@ void initBindings(nb::module_& m)
     using GenLlmReq = tb::GenericLlmRequest<runtime::ITensor::SharedPtr>;
 
     // Create and register exceptions in module scope
-    nb::exception<tb::PeftTaskNotCachedException>(m, "PeftTaskNotCachedException");
-    nb::exception<tr::LoraCacheFullException>(m, "LoraCacheFullException");
+    static nb::object peft_exc = nb::exception<tb::PeftTaskNotCachedException>(m, "PeftTaskNotCachedException");
+    static nb::object lora_exc = nb::exception<tr::LoraCacheFullException>(m, "LoraCacheFullException");
 
     // Register with no captures
     nb::register_exception_translator(
@@ -71,11 +71,11 @@ void initBindings(nb::module_& m)
         }
         catch (const tb::PeftTaskNotCachedException& e)
         {
-            PyErr_SetString(nb::type<tb::PeftTaskNotCachedException>().ptr(), e.what());
+            PyErr_SetString(peft_exc.ptr(), e.what());
         }
         catch (const tr::LoraCacheFullException& e)
         {
-            PyErr_SetString(nb::type<tr::LoraCacheFullException>().ptr(), e.what());
+            PyErr_SetString(lora_exc.ptr(), e.what());
         }
     });
diff --git a/cpp/tensorrt_llm/nanobind/executor/request.cpp b/cpp/tensorrt_llm/nanobind/executor/request.cpp
index e2ed1fb2d19..80b9b52bd9d 100644
--- a/cpp/tensorrt_llm/nanobind/executor/request.cpp
+++ b/cpp/tensorrt_llm/nanobind/executor/request.cpp
@@ -210,10 +210,21 @@ void initRequestBindings(nb::module_& m)
             nb::cast<std::optional<std::vector<tle::AdditionalModelOutput>>>(state[6]));
     };
     nb::class_<tle::OutputConfig>(m, "OutputConfig")
-        .def(nb::init<bool, bool, bool, bool, bool, bool, std::optional<std::vector<tle::AdditionalModelOutput>>>(),
-            nb::arg("return_log_probs").none() = false, nb::arg("return_context_logits") = false,
-            nb::arg("return_generation_logits") = false, nb::arg("exclude_input_from_output") = false,
-            nb::arg("return_encoder_output") = false, nb::arg("return_perf_metrics") = false,
+        .def(
+            "__init__",
+            [](tle::OutputConfig& self, std::optional<bool> return_log_probs, std::optional<bool> return_context_logits,
+                std::optional<bool> return_generation_logits, std::optional<bool> exclude_input_from_output,
+                std::optional<bool> return_encoder_output, std::optional<bool> return_perf_metrics,
+                std::optional<std::vector<tle::AdditionalModelOutput>> additional_model_outputs)
+            {
+                new (&self) tle::OutputConfig(return_log_probs.value_or(false), return_context_logits.value_or(false),
+                    return_generation_logits.value_or(false), exclude_input_from_output.value_or(false),
+                    return_encoder_output.value_or(false), return_perf_metrics.value_or(false),
+                    additional_model_outputs);
+            },
+            nb::arg("return_log_probs") = nb::none(), nb::arg("return_context_logits") = nb::none(),
+            nb::arg("return_generation_logits") = nb::none(), nb::arg("exclude_input_from_output") = nb::none(),
+            nb::arg("return_encoder_output") = nb::none(), nb::arg("return_perf_metrics") = nb::none(),
             nb::arg("additional_model_outputs") = nb::none())
         .def_rw("return_log_probs", &tle::OutputConfig::returnLogProbs)
         .def_rw("return_context_logits", &tle::OutputConfig::returnContextLogits)
diff --git a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp
index 1153ca13a8e..87f32635866 100644
--- a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp
+++ b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp
@@ -424,7 +424,7 @@ void initConfigBindings(pybind11::module_& m)
         .value("MPI", tle::CacheTransceiverConfig::BackendType::MPI)
         .value("UCX", tle::CacheTransceiverConfig::BackendType::UCX)
         .value("NIXL", tle::CacheTransceiverConfig::BackendType::NIXL)
-        .def(py::init(
+        .def("from_string",
            [](std::string const& str)
            {
                if (str == "DEFAULT" || str == "default")
                    return tle::CacheTransceiverConfig::BackendType::DEFAULT;
                if (str == "MPI" || str == "mpi")
                    return tle::CacheTransceiverConfig::BackendType::MPI;
                if (str == "UCX" || str == "ucx")
                    return tle::CacheTransceiverConfig::BackendType::UCX;
"NIXL" || str == "nixl") return tle::CacheTransceiverConfig::BackendType::NIXL; throw std::runtime_error("Invalid backend type: " + str); - })); - - py::implicitly_convertible(); + }); py::class_(m, "CacheTransceiverConfig") .def(py::init, std::optional>(), diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 125a652d800..6614391b452 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -30,6 +30,7 @@ # isort: off from ..bindings.executor import ( BatchingType as _BatchingType, + CacheTransceiverBackendType as _CacheTransceiverBackendType, CacheTransceiverConfig as _CacheTransceiverConfig, CapacitySchedulerPolicy as _CapacitySchedulerPolicy, ContextChunkingPolicy as _ContextChunkingPolicy, @@ -871,7 +872,7 @@ class CacheTransceiverConfig(BaseModel, PybindMirror): def _to_pybind(self): return _CacheTransceiverConfig( - backend=self.backend, + backend=_CacheTransceiverBackendType.from_string(self.backend), max_tokens_in_buffer=self.max_tokens_in_buffer) diff --git a/tensorrt_llm/serve/openai_protocol.py b/tensorrt_llm/serve/openai_protocol.py index 84594cd473f..4a6545beef9 100644 --- a/tensorrt_llm/serve/openai_protocol.py +++ b/tensorrt_llm/serve/openai_protocol.py @@ -252,7 +252,7 @@ def to_sampling_params(self) -> SamplingParams: add_special_tokens=self.add_special_tokens, # TODO: migrate to use logprobs and prompt_logprobs - _return_log_probs=self.logprobs, + _return_log_probs=bool(self.logprobs), ) return sampling_params @@ -543,7 +543,7 @@ def to_sampling_params(self) -> SamplingParams: add_special_tokens=self.add_special_tokens, # TODO: migrate to use logprobs and prompt_logprobs - _return_log_probs=self.logprobs, + _return_log_probs=bool(self.logprobs), ) return sampling_params