diff --git a/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h b/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h
index 43194db37a3..ce42493879e 100644
--- a/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h
+++ b/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h
@@ -79,6 +79,16 @@ class CreateNewDecoderRequests : Algorithm
         runtime::decoder::DecoderState& decoderState, CudaStream const& runtimeStream, CudaStream const& decoderStream,
         SizeType32 maxSequenceLength, SizeType32 beamWidth, OptionalRef medusaBuffers) const;
 
+    [[nodiscard]] std::tuple,
+        std::vector>
+    createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
+        executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
+        runtime::BufferManager const& bufferManager, nvinfer1::DataType logitsType,
+        runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
+        runtime::CudaStream const& runtimeStream, runtime::CudaStream const& decoderStream,
+        SizeType32 maxSequenceLength, OptionalRef medusaBuffers) const;
+
+private:
     //! @brief Initialize the decoder at `batchSlot` with a new `request`. Exposed only for static batching via
     //! GptDecoderBatched::newBatch()
     static void newRequest(SizeType32 batchSlot, runtime::decoder_batch::Request const& request,
@@ -86,7 +96,6 @@ class CreateNewDecoderRequests : Algorithm
         runtime::decoder::DecoderState& decoderState, CudaStream const& runtimeStream, CudaStream const& decoderStream,
         SizeType32 maxSequenceLength);
 
-private:
     //! @brief Setups decoder internal tensors for new speculative decoding request
     static void newRequestSpeculativeDecoding(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
         SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig,
@@ -114,15 +123,6 @@ class CreateNewDecoderRequests : Algorithm
     static void newRequestEagle(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
         runtime::ModelConfig const& modelConfig, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
 
-    [[nodiscard]] std::tuple,
-        std::vector>
-    createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
-        executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
-        runtime::BufferManager const& bufferManager, nvinfer1::DataType logitsType,
-        runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
-        runtime::CudaStream const& runtimeStream, runtime::CudaStream const& decoderStream,
-        SizeType32 maxSequenceLength, OptionalRef medusaBuffers) const;
-
     [[nodiscard]] std::shared_ptr retrieveDraftLogits(runtime::ModelConfig const& modelConfig,
         runtime::WorldConfig const& worldConfig, std::shared_ptr const& tensor,
         runtime::BufferManager const& bufferManager) const;
diff --git a/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp b/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp
index e6b06677d69..37e344b0c94 100644
--- a/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp
+++ b/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp
@@ -660,6 +660,7 @@ CreateNewDecoderRequests::createDecoderRequests(RequestVector const& finishedCon
             decoderRequest.stopWordsList->squeeze(0);
         }
 
+        TLLM_CHECK(llmReq->mSeqSlot.has_value());
         newRequest(llmReq->mSeqSlot.value(), decoderRequest, llmReq->mSamplingConfig, modelConfig, decoderState,
             runtimeStream, decoderStream, maxSequenceLength);
 
diff --git a/cpp/tests/runtime/gptDecoderBatchedTest.cpp b/cpp/tests/runtime/gptDecoderBatchedTest.cpp
index acabb4488b9..6fa48e62f45 100644
--- a/cpp/tests/runtime/gptDecoderBatchedTest.cpp
+++ b/cpp/tests/runtime/gptDecoderBatchedTest.cpp
@@ -54,18 +54,72 @@ struct DecoderInputs
     TensorPtr srcCacheIndirection;
 };
 
-void newRequests(TensorPtr const& batchSlots, std::vector const& requests,
-    std::vector const& samplingConfigs, ModelConfig const& modelConfig, GptDecoderBatched& decoder,
-    CudaStream const& runtimeStream, SizeType32 maxSequenceLength, decoder::DecoderState& decoderState)
+std::shared_ptr createLlmRequest(SizeType32 batchSlot, SizeType32 inputLengths,
+    SizeType32 generatedTokensPerSteps, SizeType32 acceptedTokensPerStep, TokenIdType inputTokenId,
+    TokenIdType expectedTokenId, SizeType32 maxNewTokens, SamplingConfig const& samplingConfig, TokenIdType endId)
 {
-    auto const& decoderStream = *decoder.getDecoderStream();
+    auto constexpr requestId = 0;
+    auto inputTokens = std::make_shared(inputLengths, inputTokenId);
+    bool isStreaming = false;
+    auto request
+        = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, endId);
+    request->mSeqSlot = batchSlot;
+
+    if (generatedTokensPerSteps > 1)
+    {
+        TokenIdType constexpr tokenToReject{1};
+        TLLM_CHECK(tokenToReject != expectedTokenId);
+        // fill with tokens to reject
+        auto draftTokens = std::make_shared(generatedTokensPerSteps - 1, tokenToReject);
+        std::fill(draftTokens->begin(), draftTokens->begin() + acceptedTokensPerStep, expectedTokenId);
+        request->setDraftTokens(draftTokens);
+    }
+
+    return request;
+}
 
+std::vector> createLlmRequests(std::vector const& inputLengths,
+    std::vector const& generatedTokensPerSteps, std::vector const& acceptedTokensPerStep,
+    TokenIdType inputTokenId, TokenIdType expectedTokenId, TensorPtr const& batchSlots,
+    std::vector const& allSamplingConfigs, SizeType32 maxNewTokens, SizeType32 endId)
+{
     auto batchSlotsRange = BufferRange(*batchSlots);
     auto const localBatchSize = batchSlots->getSize();
+
+    std::vector> requests;
     for (size_t bi = 0; bi < localBatchSize; ++bi)
     {
-        tb::CreateNewDecoderRequests::newRequest(batchSlotsRange[bi], requests[bi], samplingConfigs[bi], modelConfig,
-            decoderState, runtimeStream, decoderStream, maxSequenceLength);
+        auto const batchSlot = batchSlotsRange[bi];
+        auto llmReq = createLlmRequest(batchSlot, inputLengths[batchSlot], generatedTokensPerSteps[batchSlot],
+            acceptedTokensPerStep[batchSlot], inputTokenId, expectedTokenId, maxNewTokens,
+            allSamplingConfigs[batchSlot], endId);
+        requests.emplace_back(std::move(llmReq));
+    }
+
+    return requests;
+}
+
+void newRequests(std::vector> const& requests, TensorPtr const& batchSlots,
+    nvinfer1::DataType logitsType, ModelConfig const& modelConfig, WorldConfig const& worldConfig,
+    tle::DecodingConfig const& decodingConfig, GptDecoderBatched& decoder, CudaStream const& runtimeStream,
+    SizeType32 maxSequenceLength, tb::DecoderInputBuffers& inputBuffers, decoder::DecoderState& decoderState)
+{
+    auto const& decoderStream = *decoder.getDecoderStream();
+    auto const bufferManager = BufferManager{std::make_shared(runtimeStream.get())};
+
+    auto batchSlotsRange = BufferRange(*batchSlots);
+    auto const localBatchSize = batchSlots->getSize();
+
+    tb::CreateNewDecoderRequests createNewDecoderRequests(false, false, false);
+    auto [lookaheadPrompt, lookaheadAlgoConfigs] = createNewDecoderRequests.createDecoderRequests(requests,
+        inputBuffers.inputsIds, decodingConfig, decoderState, bufferManager, logitsType, modelConfig, worldConfig,
+        runtimeStream, decoderStream, maxSequenceLength, std::nullopt);
+
+    std::vector samplingConfigs;
+    samplingConfigs.reserve(requests.size());
+    for (auto const& llmReq : requests)
+    {
+        samplingConfigs.emplace_back(llmReq->mSamplingConfig);
     }
 
     // Setup underlying decoder.
@@ -124,38 +178,6 @@ decoder_batch::Output createDecoderOutputs(SizeType32 batchSize, SizeType32 maxB
     return outputs;
 }
 
-std::vector prepareRequests(SizeType32 batchSize, SizeType32 maxNewTokens,
-    std::vector const& inputLengths, std::vector const& generatedTokensPerSteps,
-    std::vector const& acceptedTokensPerStep, TokenIdType inputTokenId, TokenIdType expectedTokenId,
-    TokenIdType endId, BufferManager const& manager)
-{
-    auto const& stream = manager.getStream();
-
-    std::vector requests;
-    requests.reserve(batchSize);
-    for (auto batchIdx = 0; batchIdx < batchSize; ++batchIdx)
-    {
-        auto shape = ITensor::makeShape({inputLengths[batchIdx]});
-        auto input = manager.gpu(shape, TRTDataType::value);
-        kernels::invokeFill(*input, inputTokenId, stream);
-
-        requests.emplace_back(std::move(input), inputLengths[batchIdx], maxNewTokens, endId);
-        if (generatedTokensPerSteps[batchIdx] > 1)
-        {
-            TokenIdType constexpr tokenToReject{1};
-            TLLM_CHECK(tokenToReject != expectedTokenId);
-            // fill with tokens to reject
-            std::vector draftTokens(generatedTokensPerSteps[batchIdx] - 1, tokenToReject);
-            // fill with tokens to accept
-            std::fill(draftTokens.begin(), draftTokens.begin() + acceptedTokensPerStep[batchIdx], expectedTokenId);
-            requests.back().draftTokens = manager.copyFrom(draftTokens, MemoryType::kGPU);
-            requests.back().generatedTokensPerEngineStep = generatedTokensPerSteps[batchIdx];
-        }
-    }
-
-    return requests;
-}
-
 [[nodiscard]] std::vector getFinished(
     ITensor const& finishedSum, std::vector const& samplingConfigs, BufferManager& manager)
 {
@@ -303,14 +325,13 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector& sa
     auto constexpr inputTokenId = 1;
     auto constexpr expectedTokenId = 1023;
 
-    auto requests = prepareRequests(batchSize, maxNewTokens, inputLengths, generatedTokensPerSteps,
-        acceptedTokensPerStep, inputTokenId, expectedTokenId, endId, manager);
 
     // We set maxAttentionWindow = maxSeqLength, but it can be smaller than maxSeqLength (cyclic kv cache).
     auto const maxAttentionWindow = maxSeqLength;
     SizeType32 const sinkTokenLength{0};
 
     auto const decodingMode = maxBeamWidth == 1 ? tle::DecodingMode::TopKTopP() : tle::DecodingMode::BeamSearch();
+    tle::DecodingConfig decodingConfig{decodingMode};
 
     // set up decoder
     auto decoder = GptDecoderBatched(streamPtr);
@@ -336,9 +357,10 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector& sa
     auto outputs = createDecoderOutputs(
         batchSize, maxBeamWidth, maxSeqLength, tiledInputLengths, *decoderState.getSequenceLengths(), manager);
 
-    std::vector decoderRequests;
-    newRequests(inputBuffers.setupBatchSlots, requests, samplingConfigs, modelConfig, decoder, *streamPtr, maxSeqLength,
-        decoderState);
+    auto requests = createLlmRequests(inputLengths, generatedTokensPerSteps, acceptedTokensPerStep, inputTokenId,
+        expectedTokenId, inputBuffers.setupBatchSlots, samplingConfigs, maxNewTokens, endId);
+    newRequests(requests, inputBuffers.setupBatchSlots, dataType, modelConfig, worldConfig, decodingConfig, decoder,
+        *streamPtr, maxSeqLength, inputBuffers, decoderState);
     cudaDeviceSynchronize();
 
     auto expectedLengths = tiledInputLengths;
@@ -381,9 +403,10 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector& sa
     checkSequenceLengths(*decoderState.getSequenceLengths(), expectedLengths, manager);
 
     TensorPtr batchSlotsView = ITensor::slice(inputBuffers.setupBatchSlots, 0, 1);
-    std::vector singleConfig = {samplingConfigs[0]};
-    newRequests(
-        batchSlotsView, {requests[0]}, singleConfig, modelConfig, decoder, *streamPtr, maxSeqLength, decoderState);
+    requests = createLlmRequests(inputLengths, generatedTokensPerSteps, acceptedTokensPerStep, inputTokenId,
+        expectedTokenId, batchSlotsView, samplingConfigs, maxNewTokens, endId);
+    newRequests(requests, batchSlotsView, dataType, modelConfig, worldConfig, decodingConfig, decoder, *streamPtr,
+        maxSeqLength, inputBuffers, decoderState);
 
     EXPECT_FALSE(getFinished(*decoderState.getFinishedSum(), samplingConfigs, manager)[0]);
 }
@@ -442,14 +465,13 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector
-        std::vector const singleConfig = {samplingConfigs[batchIdx]};
-        newRequests(newBatchSlot, {requests[batchIdx]}, singleConfig, modelConfig, decoder, *streamPtr, maxSeqLength,
-            decoderState);
+        auto requests = createLlmRequests(inputLengths, generatedTokensPerSteps, acceptedTokensPerStep, inputTokenId,
+            expectedTokenId, newBatchSlot, samplingConfigs, maxNewTokens, endId);
+        newRequests(requests, newBatchSlot, dataType, modelConfig, worldConfig, decodingConfig, decoder, *streamPtr,
+            maxSeqLength, inputBuffers, decoderState);
 
         auto activeSlots = std::vector(batchIdx + 1);
         std::iota(activeSlots.begin(), activeSlots.end(), 0);
@@ -597,14 +620,13 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector
     auto batchSlotsRange = BufferRange(*inputBuffers.setupBatchSlots);
     std::iota(batchSlotsRange.begin(), batchSlotsRange.end(), 0);
 
-    newRequests(inputBuffers.setupBatchSlots, requests, samplingConfigs, modelConfig, decoder, *streamPtr, maxSeqLength,
-        decoderState);
+    auto requests = createLlmRequests(inputLengths, generatedTokensPerSteps, acceptedTokensPerStep, inputTokenId,
+        expectedTokenId, inputBuffers.setupBatchSlots, samplingConfigs, maxNewTokens, endId);
+    newRequests(requests, inputBuffers.setupBatchSlots, dataType, modelConfig, worldConfig, decodingConfig, decoder,
+        *streamPtr, maxSeqLength, inputBuffers, decoderState);
     cudaDeviceSynchronize();
 
     auto expectedLengths = tiledInputLengths;