103 changes: 12 additions & 91 deletions cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -100,6 +100,7 @@ class GenericLlmRequest
RequestIdType, TensorPtr&, BeamTokens const&, TStream const&, std::optional<RequestIdType>)>;
using RequestPtr = std::shared_ptr<GenericLlmRequest>;
using MillisecondsType = std::chrono::milliseconds;
+ using TimePoint = std::chrono::time_point<std::chrono::steady_clock>;
using CacheSaltIDType = runtime::CacheSaltIDType;

GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> const& inputTokens,
@@ -138,7 +139,7 @@ class GenericLlmRequest
std::optional<SizeType32> languageAdapterUid = std::nullopt,
std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
- std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
+ std::optional<CacheSaltIDType> cacheSaltID = std::nullopt, std::optional<TimePoint> arrivalTime = std::nullopt)
: mRequestId(requestId)
, mPromptLen(inputTokens->size())
, mMaxNewTokens(maxNewTokens)
@@ -202,7 +203,7 @@ class GenericLlmRequest
mState = LlmRequestState::kENCODER_INIT;
}

- initialize(*inputTokens, returnLogProbs);
+ initialize(*inputTokens, returnLogProbs, arrivalTime);
}

GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, VecTokens const& inputTokens,
@@ -2054,7 +2055,8 @@ class GenericLlmRequest
std::optional<CacheSaltIDType> mCacheSaltID{std::nullopt};

private:
- void initialize(VecTokens const& inputTokens, bool outputLogProbs)
+ void initialize(
+     VecTokens const& inputTokens, bool outputLogProbs, std::optional<TimePoint> arrivalTime = std::nullopt)
{
if (mLlmRequestType == LlmRequestType::LLMREQUEST_TYPE_GENERATION_ONLY)
{
@@ -2148,7 +2150,7 @@

if (mReturnPerfMetrics)
{
- mPerfMetrics.timingMetrics.arrivalTime = std::chrono::steady_clock::now();
+ mPerfMetrics.timingMetrics.arrivalTime = arrivalTime.value_or(std::chrono::steady_clock::now());
}
mStartTime = std::chrono::steady_clock::now();
}
@@ -2197,61 +2199,9 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
using TokenExtraIdType = Base::TokenExtraIdType;
using VecTokenExtraIds = Base::VecTokenExtraIds;

- // 49 parameters, 49 parameters in Base class constructor
- LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> inputTokens,
- runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
- std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
- std::optional<TensorPtr> badWordsList = std::nullopt, std::optional<TensorPtr> stopWordsList = std::nullopt,
- std::optional<std::shared_ptr<std::vector<SizeType32>>> positionIds = std::nullopt,
- std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
- std::optional<SizeType32> promptVocabSize = std::nullopt,
- std::optional<std::shared_ptr<std::vector<std::vector<SizeType32>>>> multimodalHashes = std::nullopt,
- std::optional<std::shared_ptr<std::vector<SizeType32>>> multimodalPositions = std::nullopt,
- std::optional<std::shared_ptr<std::vector<SizeType32>>> multimodalLengths = std::nullopt,
- std::optional<TensorPtr> multimodalEmbedding = std::nullopt,
- std::optional<TensorPtr> mropeRotaryCosSin = std::nullopt,
- std::optional<SizeType32> mropePositionDeltas = std::nullopt,
- std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
- std::optional<TensorPtr> loraConfig = std::nullopt,
- std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
- std::optional<executor::KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
- bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false,
- std::optional<std::shared_ptr<VecTokens>> const& draftTokens = std::nullopt,
- std::optional<TensorPtr> draftLogits = std::nullopt, bool excludeInputFromOutput = false,
- std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
- bool applyLogitsPostProcessorBatched = false,
- std::optional<std::shared_ptr<VecTokens>> encoderInputTokens = std::nullopt, bool returnEncoderOutput = false,
- std::optional<RequestIdType> clientId = std::nullopt,
- executor::PriorityType priority = executor::Request::kDefaultPriority,
- std::optional<TensorPtr> encoderInputFeatures = std::nullopt,
- std::optional<SizeType32> encoderOutputLength = std::nullopt,
- std::optional<TensorPtr> crossAttentionMask = std::nullopt,
- LlmRequestType llmRequestType = LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
- std::optional<std::shared_ptr<VecTokenExtraIds>> inputTokenExtraIds = std::nullopt,
- SizeType32 numReturnSequences = 1, std::optional<executor::EagleConfig> eagleConfig = std::nullopt,
- std::optional<TensorPtr> skipCrossAttnBlocks = std::nullopt, bool returnPerfMetrics = false,
- std::optional<executor::GuidedDecodingParams> guidedDecodingParams = std::nullopt,
- std::optional<SizeType32> languageAdapterUid = std::nullopt,
- std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
- std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
- std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
- : Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
- std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
- std::move(promptEmbeddingTable), promptVocabSize, std::move(multimodalHashes),
- std::move(multimodalPositions), std::move(multimodalLengths), std::move(multimodalEmbedding),
- std::move(mropeRotaryCosSin), mropePositionDeltas, loraTaskId, std::move(loraWeights),
- std::move(loraConfig), std::move(lookaheadConfig), std::move(kvCacheRetentionConfig), returnLogProbs,
- returnContextLogits, returnGenerationLogits, std::move(draftTokens), std::move(draftLogits),
- excludeInputFromOutput, std::move(logitsPostProcessor), applyLogitsPostProcessorBatched,
- std::move(encoderInputTokens), returnEncoderOutput, clientId, priority, std::move(encoderInputFeatures),
- std::move(encoderOutputLength), std::move(crossAttentionMask), llmRequestType,
- std::move(inputTokenExtraIds), numReturnSequences, std::move(eagleConfig), std::move(skipCrossAttnBlocks),
- returnPerfMetrics, std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams,
- cacheSaltID)
- {
- }
-
- // 49 parameters, 49 parameters in Base class constructor
+ // inherit constructors
+ using Base::Base;
+
LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::vector<TokenIdType> inputTokens,
runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
@@ -2286,7 +2236,7 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
std::optional<SizeType32> languageAdapterUid = std::nullopt,
std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
- std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
+ std::optional<CacheSaltIDType> cacheSaltID = std::nullopt, std::optional<TimePoint> arrivalTime = std::nullopt)
: Base(requestId, maxNewTokens, std::make_shared<std::vector<TokenIdType>>(std::move(inputTokens)),
samplingConfig, isStreaming, endId, padId, std::move(embeddingBias), std::move(badWordsList),
std::move(stopWordsList),
@@ -2316,37 +2266,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
inputTokenExtraIds ? std::make_optional(std::make_shared<VecTokenExtraIds>(std::move(*inputTokenExtraIds)))
: std::optional<std::shared_ptr<VecTokenExtraIds>>(std::nullopt),
numReturnSequences, std::move(eagleConfig), skipCrossAttnBlocks, returnPerfMetrics,
- std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams, cacheSaltID)
- {
- }
-
- // 32 parameters, 32 parameters in Base class constructor
- LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, VecTokens const& inputTokens,
- runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
- std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
- std::optional<TensorPtr> badWordsList = std::nullopt, std::optional<TensorPtr> stopWordsList = std::nullopt,
- std::optional<std::shared_ptr<std::vector<SizeType32>>> positionIds = std::nullopt,
- std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
- std::optional<SizeType32> promptVocabSize = std::nullopt,
- std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
- std::optional<TensorPtr> loraConfig = std::nullopt,
- std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt, bool returnLogProbs = false,
- bool returnContextLogits = false, bool returnGenerationLogits = false,
- std::optional<VecTokens> draftTokens = std::nullopt, std::optional<TensorPtr> draftLogits = std::nullopt,
- bool excludeInputFromOutput = false, std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
- bool applyLogitsPostProcessorBatched = false, std::optional<VecTokens> encoderInputTokens = std::nullopt,
- bool returnEncoderOutput = false, std::optional<RequestIdType> clientId = std::nullopt,
- executor::PriorityType priority = executor::Request::kDefaultPriority, SizeType32 numReturnSequences = 1,
- std::optional<SizeType32> languageAdapterUid = std::nullopt,
- std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
- std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
- : Base(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, endId, padId,
- std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
- std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
- lookaheadConfig, returnLogProbs, returnContextLogits, returnGenerationLogits, std::move(draftTokens),
- std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
- applyLogitsPostProcessorBatched, std::move(encoderInputTokens), returnEncoderOutput, clientId, priority,
- numReturnSequences, languageAdapterUid, contextPhaseParams, cacheSaltID)
+ std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams, cacheSaltID,
+ arrivalTime)
{
}

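Net effect of this header diff: the shared_ptr-flavored constructor that merely forwarded all 49 arguments to the base class is deleted in favor of `using Base::Base;`, and the surviving constructor gains a trailing `std::optional<TimePoint> arrivalTime` that is threaded into `initialize()`, where `arrivalTime.value_or(std::chrono::steady_clock::now())` preserves the old stamp-at-construction behavior whenever the caller passes nothing. A minimal, self-contained sketch of the inherited-constructor idiom that makes the deletion safe (toy types, not the real 49-parameter signature):

#include <string>
#include <utility>

struct BaseReq
{
    // Stands in for GenericLlmRequest and its many-parameter constructor.
    BaseReq(int id, std::string tokens) : mId(id), mTokens(std::move(tokens)) {}
    int mId;
    std::string mTokens;
};

struct DerivedReq : BaseReq
{
    using BaseReq::BaseReq; // inherits BaseReq(int, std::string) verbatim
};

int main()
{
    DerivedReq req(42, "tokens"); // resolves to the inherited base constructor
    return req.mId == 42 ? 0 : 1;
}

Inheriting the constructors is safe here because the derived class adds no data members that need their own initialization; a future LlmRequest-specific parameter would force the hand-written forwarder back.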
11 changes: 7 additions & 4 deletions cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
@@ -32,6 +32,7 @@

#include <ATen/ATen.h>
#include <nanobind/nanobind.h>
+ #include <nanobind/stl/chrono.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/shared_ptr.h>
#include <nanobind/stl/tuple.h>
@@ -289,7 +290,8 @@ void initBindings(nb::module_& m)
std::optional<tb::LlmRequest::SizeType32> language_adapter_uid,
std::optional<tb::LlmRequest::MillisecondsType> allotted_time_ms,
std::optional<executor::ContextPhaseParams> context_phase_params,
- std::optional<tb::LlmRequest::CacheSaltIDType> cache_salt_id)
+ std::optional<tb::LlmRequest::CacheSaltIDType> cache_salt_id,
+ std::optional<tb::LlmRequest::TimePoint> arrival_time)
{
auto makeOptionalTensor = [](std::optional<at::Tensor> const& atTensor, bool unsqueeze = false)
{
@@ -329,8 +331,8 @@
encoder_input_tokens, return_encoder_output, client_id, priority, encoder_input_features_tensor_ptr,
encoder_output_length, cross_attention_mask_tensor_ptr, llm_request_type, input_token_extra_ids,
num_return_sequences, eagle_config, skip_cross_attn_blocks_tensor_ptr, return_perf_metrics,
- guided_decoding_params, language_adapter_uid, allotted_time_ms, context_phase_params,
- cache_salt_id};
+ guided_decoding_params, language_adapter_uid, allotted_time_ms, context_phase_params, cache_salt_id,
+ arrival_time};
},
nb::arg("request_id"), nb::arg("max_new_tokens"), nb::arg("input_tokens"), nb::arg("sampling_config"),
nb::arg("is_streaming"), nb::arg("end_id") = std::nullopt, nb::arg("pad_id") = std::nullopt,
@@ -355,7 +357,8 @@
nb::arg("eagle_config") = std::nullopt, nb::arg("skip_cross_attn_blocks") = std::nullopt,
nb::arg("return_perf_metrics") = false, nb::arg("guided_decoding_params") = std::nullopt,
nb::arg("language_adapter_uid") = std::nullopt, nb::arg("allotted_time_ms") = std::nullopt,
nb::arg("context_phase_params") = std::nullopt, nb::arg("cache_salt_id") = std::nullopt)
nb::arg("context_phase_params") = std::nullopt, nb::arg("cache_salt_id") = std::nullopt,
nb::arg("arrival_time") = std::nullopt)
.def("check_token_id_range", &tb::LlmRequest::checkTokenIdRange, nb::arg("vocab_size"))
.def(nb::init<tb::LlmRequest const&>())
.def("validate", &tb::LlmRequest::validate, nb::arg("max_input_len"), nb::arg("max_seq_len"),
3 changes: 2 additions & 1 deletion cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp
@@ -126,6 +126,7 @@ std::shared_ptr<tb::LlmRequest> LlmRequest::toTrtLlm() const
mLanguageAdapterUid, //
mAllottedTimeMs, //
mContextPhaseParams, //
- mCacheSaltID //
+ mCacheSaltID, //
+ mPerfMetrics.timingMetrics.arrivalTime //
);
}
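The `toTrtLlm()` change is about metric fidelity: converting the nanobind-side request now forwards the already-stamped `mPerfMetrics.timingMetrics.arrivalTime` instead of letting the freshly built C++ request stamp itself, so queueing delay accrued before conversion stays inside the measured window. A hedged sketch of the invariant (the toy `Request` type and `convert` function are illustrative, not the real API):

#include <cassert>
#include <chrono>

using TimePoint = std::chrono::time_point<std::chrono::steady_clock>;

struct Request
{
    TimePoint arrivalTime;
};

// Mirrors what toTrtLlm() now does: forward the original stamp, never re-stamp.
Request convert(Request const& src)
{
    return Request{src.arrivalTime};
}

int main()
{
    Request const original{std::chrono::steady_clock::now()};
    // ... the request may wait in a Python-side queue here ...
    Request const converted = convert(original);
    assert(converted.arrivalTime == original.arrivalTime);
    return 0;
}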
6 changes: 4 additions & 2 deletions cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h
@@ -85,7 +85,8 @@ class LlmRequest : public tb::GenericLlmRequest<at::Tensor, c10::Stream>
std::optional<SizeType32> languageAdapterUid = std::nullopt,
std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
- std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
+ std::optional<CacheSaltIDType> cacheSaltID = std::nullopt,
+ std::optional<TimePoint> arrivalTime = std::nullopt)
: Base(requestId, //
maxNewTokens, //
std::make_shared<std::vector<TokenIdType>>(std::move(inputTokens)), //
@@ -147,7 +148,8 @@
languageAdapterUid, //
allottedTimeMs, //
contextPhaseParams, //
- cacheSaltID //
+ cacheSaltID, //
+ arrivalTime //
)
{
}
3 changes: 3 additions & 0 deletions cpp/tensorrt_llm/nanobind/bindings.cpp
@@ -19,6 +19,7 @@
#include <nanobind/nanobind.h>
#include <nanobind/operators.h>
#include <nanobind/stl/bind_vector.h>
+ #include <nanobind/stl/chrono.h>
#include <nanobind/stl/filesystem.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/shared_ptr.h>
@@ -511,4 +512,6 @@ NB_MODULE(TRTLLM_NB_MODULE, m)
m.def("ipc_nvls_allocate", &tr::ipcNvlsAllocate, nb::rv_policy::reference);
m.def("ipc_nvls_free", &tr::ipcNvlsFree);
m.def("ipc_nvls_supported", &tr::ipcNvlsSupported);

m.def("steady_clock_now", []() { return std::chrono::steady_clock::now(); });
}
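`steady_clock_now` is presumably exposed so Python callers can produce an `arrival_time` on the same monotonic clock the C++ timing metrics use; a stamp from a different source such as Python's `time.monotonic()` would share no epoch with it, and the new `<nanobind/stl/chrono.h>` includes are what let the returned `time_point` convert across the language boundary. A small sketch of the arithmetic a shared clock enables (illustrative, not PR code):

#include <chrono>
#include <cstdio>

using Clock = std::chrono::steady_clock;

int main()
{
    // What a frontend captures via steady_clock_now() when the request arrives.
    Clock::time_point const arrival = Clock::now();

    // ... the request is queued, scheduled, and emits its first token ...
    Clock::time_point const firstToken = Clock::now();

    // The difference is only meaningful because both stamps share one clock.
    auto const ttft = std::chrono::duration_cast<std::chrono::milliseconds>(firstToken - arrival);
    std::printf("time to first token: %lld ms\n", static_cast<long long>(ttft.count()));
    return 0;
}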
8 changes: 5 additions & 3 deletions cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
@@ -295,7 +295,8 @@ void initBindings(pybind11::module_& m)
std::optional<tb::LlmRequest::SizeType32> language_adapter_uid,
std::optional<tb::LlmRequest::MillisecondsType> allotted_time_ms,
std::optional<executor::ContextPhaseParams> context_phase_params,
- std::optional<tb::LlmRequest::CacheSaltIDType> cache_salt_id)
+ std::optional<tb::LlmRequest::CacheSaltIDType> cache_salt_id,
+ std::optional<tb::LlmRequest::TimePoint> arrival_time)
{
auto makeOptionalTensor = [](std::optional<at::Tensor> const& atTensor, bool unsqueeze = false)
{
@@ -336,7 +337,7 @@
encoder_input_features_tensor_ptr, encoder_output_length, cross_attention_mask_tensor_ptr,
llm_request_type, input_token_extra_ids, num_return_sequences, eagle_config,
skip_cross_attn_blocks_tensor_ptr, return_perf_metrics, guided_decoding_params,
- language_adapter_uid, allotted_time_ms, context_phase_params, cache_salt_id};
+ language_adapter_uid, allotted_time_ms, context_phase_params, cache_salt_id, arrival_time};
}),
py::arg("request_id"), py::arg("max_new_tokens"), py::arg("input_tokens"), py::arg("sampling_config"),
py::arg("is_streaming"), py::arg("end_id") = std::nullopt, py::arg("pad_id") = std::nullopt,
@@ -362,7 +363,8 @@
py::arg("eagle_config") = std::nullopt, py::arg("skip_cross_attn_blocks") = std::nullopt,
py::arg("return_perf_metrics") = false, py::arg("guided_decoding_params") = std::nullopt,
py::arg("language_adapter_uid") = std::nullopt, py::arg("allotted_time_ms") = std::nullopt,
py::arg("context_phase_params") = std::nullopt, py::arg("cache_salt_id") = std::nullopt)
py::arg("context_phase_params") = std::nullopt, py::arg("cache_salt_id") = std::nullopt,
py::arg("arrival_time") = std::nullopt)
.def("check_token_id_range", &tb::LlmRequest::checkTokenIdRange, py::arg("vocab_size"))
.def(py::init<tb::LlmRequest const&>())
.def("validate", &tb::LlmRequest::validate, py::arg("max_input_len"), py::arg("max_seq_len"),
3 changes: 2 additions & 1 deletion cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp
@@ -125,6 +125,7 @@ std::shared_ptr<tb::LlmRequest> LlmRequest::toTrtLlm() const
mLanguageAdapterUid, //
mAllottedTimeMs, //
mContextPhaseParams, //
- mCacheSaltID //
+ mCacheSaltID, //
+ mPerfMetrics.timingMetrics.arrivalTime //
);
}