diff --git a/3rdparty/xgrammar b/3rdparty/xgrammar
index b14a281566d..774867ce410 160000
--- a/3rdparty/xgrammar
+++ b/3rdparty/xgrammar
@@ -1 +1 @@
-Subproject commit b14a281566d7a53a2bd1b09fc11ac735977f9e36
+Subproject commit 774867ce410ce1c5c7c10011cc0a9e20bd7894e2
diff --git a/cpp/tensorrt_llm/batch_manager/CMakeLists.txt b/cpp/tensorrt_llm/batch_manager/CMakeLists.txt
index 26aae8ca052..5f7d774c0b0 100644
--- a/cpp/tensorrt_llm/batch_manager/CMakeLists.txt
+++ b/cpp/tensorrt_llm/batch_manager/CMakeLists.txt
@@ -59,7 +59,7 @@ set(SRCS
 file(GLOB_RECURSE XGRAMMAR_SRCS "${3RDPARTY_DIR}/xgrammar/cpp/*.cc")
 list(FILTER XGRAMMAR_SRCS EXCLUDE REGEX
-     "${3RDPARTY_DIR}/xgrammar/cpp/pybind/.*\\.cc")
+     "${3RDPARTY_DIR}/xgrammar/cpp/nanobind/.*\\.cc")
 list(APPEND SRCS ${XGRAMMAR_SRCS})
 
 if(NOT WIN32)
diff --git a/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp b/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp
index a5a7502c330..040dcd147e9 100644
--- a/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp
+++ b/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp
@@ -18,8 +18,10 @@
 #include "tensorrt_llm/batch_manager/guidedDecoder.h"
 #include "tensorrt_llm/batch_manager/decoderBuffers.h"
 #include "tensorrt_llm/batch_manager/llmRequest.h"
+#include "tensorrt_llm/common/envUtils.h"
 #include "tensorrt_llm/kernels/logitsBitmask.h"
 
+#include <nlohmann/json.hpp>
 #include <xgrammar/xgrammar.h>
 
 using namespace tensorrt_llm::runtime;
@@ -41,20 +43,23 @@ GuidedDecoder::GuidedDecoder(executor::GuidedDecodingConfig const& guidedDecodin
     if (mGuidedDecodingBackend == executor::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR)
     {
         mXGrammarMatchers.resize(mMaxNumSequences);
+        xgrammar::VocabType vocabType = xgrammar::VocabType::RAW;
+        bool addPrefixSpace = false;
         auto const& tokenizerStr = guidedDecodingConfig.getTokenizerStr();
         if (tokenizerStr)
         {
-            auto const& tokenizerInfo = xgrammar::TokenizerInfo::FromHuggingFace(
-                guidedDecodingConfig.getEncodedVocab().value(), guidedDecodingConfig.getTokenizerStr().value(),
-                mVocabSizePadded, guidedDecodingConfig.getStopTokenIds());
-            mXGrammarCompiler = std::make_shared<xgrammar::GrammarCompiler>(tokenizerInfo);
-        }
-        else
-        {
-            auto const& tokenizerInfo = xgrammar::TokenizerInfo(guidedDecodingConfig.getEncodedVocab().value(),
-                xgrammar::VocabType::RAW, mVocabSizePadded, guidedDecodingConfig.getStopTokenIds());
-            mXGrammarCompiler = std::make_shared<xgrammar::GrammarCompiler>(tokenizerInfo);
+            auto const& metadata = xgrammar::TokenizerInfo::DetectMetadataFromHF(tokenizerStr.value());
+            auto const& metadataJson = nlohmann::json::parse(metadata);
+            vocabType = metadataJson.at("vocab_type").template get<xgrammar::VocabType>();
+            addPrefixSpace = metadataJson.at("add_prefix_space").template get<bool>();
         }
+        auto const& tokenizerInfo = xgrammar::TokenizerInfo(guidedDecodingConfig.getEncodedVocab().value(), vocabType,
+            mVocabSizePadded, guidedDecodingConfig.getStopTokenIds(), addPrefixSpace);
+
+        auto const cacheLimitGb = common::getFloatEnv("XGRAMMAR_CACHE_LIMIT_GB");
+        mXGrammarCompiler = std::make_shared<xgrammar::GrammarCompiler>(tokenizerInfo, /*max_threads=*/8,
+            /*cache_enabled=*/true,
+            /*cache_limit_bytes=*/static_cast<long long>(cacheLimitGb.value_or(1.0f) * 1024 * 1024 * 1024));
 
         auto const logitsPtrDtype = BufferDataType{mLogitsDtype, false, true};
         auto constexpr bitmaskDtype = TRTDataType<BitmaskT>::value;
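For reviewers tracing the new `XGRAMMAR_CACHE_LIMIT_GB` knob: both the C++ constructor above and the Python `XGrammarMatcherFactory` later in this diff convert gigabytes to bytes the same way, defaulting to 1 GiB. A minimal sketch of that conversion (illustrative only; the env var name and default come from this diff):

```python
import os

# Mirrors cacheLimitGb.value_or(1.0f) * 1024 * 1024 * 1024 in guidedDecoder.cpp
# and the float(os.getenv(...)) parsing in grammar_matcher.py.
cache_limit_gb = float(os.getenv("XGRAMMAR_CACHE_LIMIT_GB", "1"))
cache_limit_bytes = int(cache_limit_gb * 1024 * 1024 * 1024)
print(cache_limit_bytes)  # 1073741824 by default; 536870912 for XGRAMMAR_CACHE_LIMIT_GB=0.5
```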
@@ -89,27 +94,56 @@ void GuidedDecoder::build(ScheduledRequests const& scheduledRequests)
                 // The request is in the first context forward step (considering kv cache reuse).
                 auto const& guideType = guidedDecodingParams->getGuideType();
                 auto const& guide = guidedDecodingParams->getGuide();
-                if (guideType == executor::GuidedDecodingParams::GuideType::kJSON)
+                switch (guideType)
+                {
+                case executor::GuidedDecodingParams::GuideType::kJSON:
                 {
                     mXGrammarMatchers.at(seqSlot) = std::make_shared<xgrammar::GrammarMatcher>(
                         mXGrammarCompiler->CompileBuiltinJSONGrammar());
+                    break;
                 }
-                else if (guideType == executor::GuidedDecodingParams::GuideType::kJSON_SCHEMA)
+                case executor::GuidedDecodingParams::GuideType::kJSON_SCHEMA:
                 {
                     mXGrammarMatchers.at(seqSlot) = std::make_shared<xgrammar::GrammarMatcher>(
                         mXGrammarCompiler->CompileJSONSchema(guide.value()));
+                    break;
                 }
-                else if (guideType == executor::GuidedDecodingParams::GuideType::kREGEX)
+                case executor::GuidedDecodingParams::GuideType::kREGEX:
                 {
                     auto const& grammar = xgrammar::Grammar::FromRegex(guide.value());
                     mXGrammarMatchers.at(seqSlot)
                         = std::make_shared<xgrammar::GrammarMatcher>(mXGrammarCompiler->CompileGrammar(grammar));
+                    break;
                 }
-                else if (guideType == executor::GuidedDecodingParams::GuideType::kEBNF_GRAMMAR)
+                case executor::GuidedDecodingParams::GuideType::kEBNF_GRAMMAR:
                 {
                     auto const& grammar = xgrammar::Grammar::FromEBNF(guide.value());
                     mXGrammarMatchers.at(seqSlot)
                         = std::make_shared<xgrammar::GrammarMatcher>(mXGrammarCompiler->CompileGrammar(grammar));
+                    break;
+                }
+                case executor::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG:
+                {
+                    auto const& structuralTagParametersJson = nlohmann::json::parse(guide.value());
+                    auto const& structuralTagItemsJson
+                        = structuralTagParametersJson.at("structures").template get<std::vector<nlohmann::json>>();
+                    std::vector<xgrammar::StructuralTagItem> structuralTagItems;
+                    for (auto const& s : structuralTagItemsJson)
+                    {
+                        structuralTagItems.emplace_back(
+                            xgrammar::StructuralTagItem{s.at("begin").template get<std::string>(),
+                                s.at("schema").dump(), s.at("end").template get<std::string>()});
+                    }
+                    auto const& triggers
+                        = structuralTagParametersJson.at("triggers").template get<std::vector<std::string>>();
+                    mXGrammarMatchers.at(seqSlot) = std::make_shared<xgrammar::GrammarMatcher>(
+                        mXGrammarCompiler->CompileStructuralTag(structuralTagItems, triggers));
+                    break;
+                }
+                default:
+                {
+                    TLLM_THROW("Unsupported guide type.");
+                }
                 }
             }
             else if (llmReq->isGenerationInProgressState())
diff --git a/cpp/tensorrt_llm/common/envUtils.cpp b/cpp/tensorrt_llm/common/envUtils.cpp
index a4fb013b2fc..f7480229410 100644
--- a/cpp/tensorrt_llm/common/envUtils.cpp
+++ b/cpp/tensorrt_llm/common/envUtils.cpp
@@ -50,6 +50,17 @@ std::optional<uint64_t> getUInt64Env(char const* name)
     return {val};
 };
 
+std::optional<float> getFloatEnv(char const* name)
+{
+    char const* const env = std::getenv(name);
+    if (env == nullptr)
+    {
+        return std::nullopt;
+    }
+    float const val = std::stof(env);
+    return {val};
+}
+
 std::optional<std::string> getStrEnv(char const* name)
 {
     char const* const env = std::getenv(name);
diff --git a/cpp/tensorrt_llm/common/envUtils.h b/cpp/tensorrt_llm/common/envUtils.h
index e85ee6468f9..5e29dfaca71 100644
--- a/cpp/tensorrt_llm/common/envUtils.h
+++ b/cpp/tensorrt_llm/common/envUtils.h
@@ -27,6 +27,8 @@ std::optional<int32_t> getIntEnv(char const* name);
 
 std::optional<uint64_t> getUInt64Env(char const* name);
 
+std::optional<float> getFloatEnv(char const* name);
+
 bool getBoolEnv(char const* name);
 
 // XQA kernels (optimized kernels for generation phase).
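The guide string parsed by the new kSTRUCTURAL_TAG branch is a JSON object carrying a "structures" list of {begin, schema, end} items plus a "triggers" list; those key names come straight from the parsing code above. A hedged sketch of such a payload (the tag markers and function schema are made-up examples):

```python
import json

# Illustrative only: "structures", "begin", "schema", "end" and "triggers"
# are the keys the C++ branch reads; the schema object is serialized back to
# a string via s.at("schema").dump() before compilation.
structural_tag = json.dumps({
    "structures": [{
        "begin": "<function=get_weather>",
        "schema": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
        "end": "</function>",
    }],
    "triggers": ["<function="],
})
```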
diff --git a/cpp/tensorrt_llm/executor/executorImpl.cpp b/cpp/tensorrt_llm/executor/executorImpl.cpp
index b9df58f9a73..c2118bbf771 100644
--- a/cpp/tensorrt_llm/executor/executorImpl.cpp
+++ b/cpp/tensorrt_llm/executor/executorImpl.cpp
@@ -1621,9 +1621,6 @@ std::tuple Executor::Impl::fetchNewRequests
             TLLM_CHECK_WITH_INFO(mModel->hasGuidedDecoder(),
                 "Request is specified with GuidedDecodingParams, but GuidedDecoder is not setup. Please "
                 "provide a valid GuidedDecodingConfig to setup GuidedDecoder.");
-            TLLM_CHECK_WITH_INFO(newReq->getGuidedDecodingParams()->getGuideType()
-                    != executor::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG,
-                "Structural tag is not supported for guided decoding in C++ Executor.");
         }
 
         if (mModel->getWorldConfig().isLastPipelineParallelRank() && newReq->hasAdditionalOutputs())
diff --git a/requirements.txt b/requirements.txt
index 16c1e4b5f8c..3c55f8e16fd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -51,7 +51,7 @@ peft
 einops
 flashinfer-python==0.2.5
 opencv-python-headless
-xgrammar==0.1.19
+xgrammar==0.1.21
 backoff
 nvtx
 matplotlib # FIXME: this is added to make nvtx happy
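Both the C++ path (via the submodule bump) and the Python path below depend on newer xgrammar APIs (DetectMetadataFromHF, CompileStructuralTag, the cache limit parameter), so an environment with a stale wheel fails at runtime. A minimal, standard-library-only check against the pin above (the exact-match assertion is illustrative; a >= check may be preferable):

```python
from importlib.metadata import version

# requirements.txt pins xgrammar==0.1.21 in this change.
installed = version("xgrammar")
assert installed == "0.1.21", f"expected xgrammar 0.1.21, found {installed}"
```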
diff --git a/tensorrt_llm/_torch/pyexecutor/grammar_matcher.py b/tensorrt_llm/_torch/pyexecutor/grammar_matcher.py
index 3982fc755fb..7d51af7ae19 100644
--- a/tensorrt_llm/_torch/pyexecutor/grammar_matcher.py
+++ b/tensorrt_llm/_torch/pyexecutor/grammar_matcher.py
@@ -49,21 +49,21 @@ class XGrammarMatcherFactory(GrammarMatcherFactory):
     def __init__(self, guided_decoding_config: GuidedDecodingConfig,
                  vocab_size_padded: int):
         super().__init__()
+        vocab_type = xgrammar.VocabType.RAW
+        add_prefix_space = False
         if guided_decoding_config.tokenizer_str is not None:
             metadata = xgrammar.TokenizerInfo._detect_metadata_from_hf(
                 guided_decoding_config.tokenizer_str)
-            tokenizer_info = xgrammar.TokenizerInfo(
-                guided_decoding_config.encoded_vocab,
-                vocab_type=metadata["vocab_type"],
-                vocab_size=vocab_size_padded,
-                stop_token_ids=guided_decoding_config.stop_token_ids,
-                add_prefix_space=metadata["add_prefix_space"])
-        else:
-            tokenizer_info = xgrammar.TokenizerInfo(
-                guided_decoding_config.encoded_vocab,
-                xgrammar.VocabType.RAW,
-                vocab_size=vocab_size_padded,
-                stop_token_ids=guided_decoding_config.stop_token_ids)
+            vocab_type = metadata["vocab_type"]
+            add_prefix_space = metadata["add_prefix_space"]
+
+        tokenizer_info = xgrammar.TokenizerInfo(
+            guided_decoding_config.encoded_vocab,
+            vocab_type=vocab_type,
+            vocab_size=vocab_size_padded,
+            stop_token_ids=guided_decoding_config.stop_token_ids,
+            add_prefix_space=add_prefix_space)
+
         # Default cache limit is 1GB.
         cache_limit_gb = float(os.getenv("XGRAMMAR_CACHE_LIMIT_GB", "1"))
         cache_limit_bytes = int(cache_limit_gb * 1024 * 1024 * 1024)
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index a563bc98f28..d900133ee4d 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -28,8 +28,7 @@
 
 # yapf: disable
 # isort: off
-from ..bindings.executor import (
-    BatchingType as _BatchingType,
+from ..bindings.executor import (BatchingType as _BatchingType,
     CacheTransceiverBackendType as _CacheTransceiverBackendType,
     CacheTransceiverConfig as _CacheTransceiverConfig,
     CapacitySchedulerPolicy as _CapacitySchedulerPolicy,
diff --git a/tensorrt_llm/sampling_params.py b/tensorrt_llm/sampling_params.py
index d6da05d01bd..73012acb58f 100644
--- a/tensorrt_llm/sampling_params.py
+++ b/tensorrt_llm/sampling_params.py
@@ -19,7 +19,7 @@ class GuidedDecodingParams:
         regex (str, optional): The generated text is amenable to the user-specified regular expression. Defaults to None.
         grammar (str, optional): The generated text is amenable to the user-specified extended Backus-Naur form (EBNF) grammar. Defaults to None.
         json_object (bool): If True, the generated text is amenable to json format. Defaults to False.
-        structural_tag (str, optional): The generated text is amenable to the user-specified structural tag. Structural tag is supported by xgrammar in PyTorch backend only. Defaults to None.
+        structural_tag (str, optional): The generated text is amenable to the user-specified structural tag. Structural tag is supported by xgrammar backend only. Defaults to None.
     """ # noqa: E501
 
     json: Optional[Union[str, BaseModel, dict]] = None
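Taken together, a hedged end-to-end sketch of exercising the newly unblocked structural-tag path through the LLM API (the model name and tag contents are assumptions; `GuidedDecodingParams(structural_tag=...)` and `XGRAMMAR_CACHE_LIMIT_GB` come from this diff, while the `guided_decoding_backend` and `guided_decoding` keywords are the existing LLM/SamplingParams API):

```python
import json
import os

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.sampling_params import GuidedDecodingParams

os.environ["XGRAMMAR_CACHE_LIMIT_GB"] = "2"  # optional: grow the grammar-compiler cache

# Same payload shape parsed by the C++ kSTRUCTURAL_TAG branch.
structural_tag = json.dumps({
    "structures": [{
        "begin": "<function=get_weather>",
        "schema": {"type": "object", "properties": {"city": {"type": "string"}}},
        "end": "</function>",
    }],
    "triggers": ["<function="],
})

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
          guided_decoding_backend="xgrammar")
params = SamplingParams(
    guided_decoding=GuidedDecodingParams(structural_tag=structural_tag))
output = llm.generate("What is the weather in Paris?", params)
print(output.outputs[0].text)
```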