2 changes: 2 additions & 0 deletions cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp
@@ -35,6 +35,8 @@ GuidedDecoder::GuidedDecoder(executor::GuidedDecodingConfig const& guidedDecodin
     , mLogitsDtype{logitsDtype}
     , mCopyBufferManager{std::make_shared<CudaStream>()}
 {
+    TLLM_CHECK_WITH_INFO(mGuidedDecodingBackend != executor::GuidedDecodingConfig::GuidedDecodingBackend::kLLGUIDANCE,
+        "LLGuidance is not supported for guided decoding in C++ runtime.");
     if (mGuidedDecodingBackend == executor::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR)
     {
         mXGrammarMatchers.resize(mMaxNumSequences);
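This guard means a `GuidedDecodingConfig` selecting LLGuidance is rejected as soon as the C++ runtime constructs its `GuidedDecoder`. A minimal sketch of how that would surface through the LLM API, assuming the TensorRT-engine `LLM` entry point and an illustrative model path:

```python
from tensorrt_llm import LLM

# Sketch only: with the TensorRT (C++) runtime, selecting llguidance is
# expected to fail at setup time via the TLLM_CHECK_WITH_INFO above,
# rather than at request time.
try:
    llm = LLM("meta-llama/Llama-3.1-8B-Instruct",  # illustrative model
              guided_decoding_backend="llguidance")
except Exception as err:
    # Expected to carry: "LLGuidance is not supported for guided
    # decoding in C++ runtime."
    print(err)
```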
3 changes: 3 additions & 0 deletions cpp/tensorrt_llm/executor/executorImpl.cpp
@@ -1621,6 +1621,9 @@ std::tuple<Executor::Impl::RequestList, double> Executor::Impl::fetchNewRequests
             TLLM_CHECK_WITH_INFO(mModel->hasGuidedDecoder(),
                 "Request is specified with GuidedDecodingParams, but GuidedDecoder is not setup. Please "
                 "provide a valid GuidedDecodingConfig to setup GuidedDecoder.");
+            TLLM_CHECK_WITH_INFO(newReq->getGuidedDecodingParams()->getGuideType()
+                    != executor::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG,
+                "Structural tag is not supported for guided decoding in C++ Executor.");
         }

         if (mModel->getWorldConfig().isLastPipelineParallelRank() && newReq->hasAdditionalOutputs())
3 changes: 1 addition & 2 deletions examples/trtllm-eval/README.md
@@ -10,8 +10,7 @@ We provide a CLI tool `trtllm-eval` for evaluating model accuracy. It shares the
 pip install -r requirements.txt

 # Evaluate Llama-3.1-8B-Instruct on MMLU
-wget https://people.eecs.berkeley.edu/~hendrycks/data.tar && tar -xf data.tar
-trtllm-eval --model meta-llama/Llama-3.1-8B-Instruct mmlu --dataset_path data
+trtllm-eval --model meta-llama/Llama-3.1-8B-Instruct mmlu

 # Evaluate Llama-3.1-8B-Instruct on GSM8K
 trtllm-eval --model meta-llama/Llama-3.1-8B-Instruct gsm8k
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/pyexecutor/grammar_matcher.py
@@ -96,7 +96,7 @@ def create(self,
                 compiled_grammar = self._xgrammar_compiler.compile_structural_tag(
                     structures, triggers)
             case _:
-                raise ValueError(f"Unrecognized guide type: {guide_type}.")
+                raise ValueError(f"Unsupported guide type: {guide_type}.")

         matcher = xgrammar.GrammarMatcher(compiled_grammar)
         return XGrammarMatcher(matcher)
@@ -167,7 +167,7 @@ def create(
                 # provide Lark-formatted grammar instead of standard EBNF.
                 grammar = llguidance.LLMatcher.grammar_from_lark(guide)
             case _:
-                raise ValueError(f"Unrecognized guide type: {guide_type}.")
+                raise ValueError(f"Unsupported guide type: {guide_type}.")

         matcher = llguidance.LLMatcher(self._tokenizer, grammar)
         if matcher.is_error():
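Both matchers now reject unsupported guide types with the same wording. For the new LLGuidance path, a minimal standalone sketch of grammar compilation using the same API calls as above (the helper name and Lark grammar are illustrative; building an llguidance-compatible tokenizer is out of scope here):

```python
import llguidance


def build_lark_matcher(tokenizer: llguidance.LLTokenizer,
                       lark_grammar: str) -> llguidance.LLMatcher:
    # llguidance consumes Lark-formatted grammars rather than standard
    # EBNF, hence the grammar_from_lark conversion mirrored from above.
    grammar = llguidance.LLMatcher.grammar_from_lark(lark_grammar)
    matcher = llguidance.LLMatcher(tokenizer, grammar)
    if matcher.is_error():
        raise ValueError("Lark grammar failed to compile.")
    return matcher


# Illustrative grammar: constrain output to a yes/no answer.
YES_NO_GRAMMAR = 'start: "yes" | "no"'
```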
8 changes: 7 additions & 1 deletion tensorrt_llm/evaluate/json_mode_eval.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+import os
 from typing import Iterable, List, Optional, Union

 import click
@@ -56,8 +57,13 @@ def generate_samples(self) -> Iterable[tuple]:
         for i, sample in enumerate(self.data):
             if i >= self.num_samples:
                 break
+            schema = sample["schema"]
+            if os.environ.get("TRTLLM_XGUIDANCE_LENIENT") == "1":
+                schema = json.loads(schema)
+                schema["x-guidance"] = {"lenient": True}
+                schema = json.dumps(schema)
             sampling_args = {
-                "guided_decoding": GuidedDecodingParams(json=sample["schema"])
+                "guided_decoding": GuidedDecodingParams(json=schema)
             }
             yield sample["prompt"], sampling_args, sample["completion"]
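The new environment hook rewrites each schema to opt into llguidance's lenient mode via an `x-guidance` extension. A standalone sketch of the same transformation (the schema literal is illustrative):

```python
import json
import os

schema = '{"type": "object", "properties": {"name": {"type": "string"}}}'

# Mirrors the TRTLLM_XGUIDANCE_LENIENT hook above: parse the schema,
# inject the x-guidance extension, and re-serialize it before it is
# handed to GuidedDecodingParams.
if os.environ.get("TRTLLM_XGUIDANCE_LENIENT") == "1":
    doc = json.loads(schema)
    doc["x-guidance"] = {"lenient": True}
    schema = json.dumps(doc)
```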
7 changes: 5 additions & 2 deletions tensorrt_llm/llmapi/llm_args.py
@@ -879,8 +879,11 @@ class BaseLlmArgs(BaseModel):
     enable_chunked_prefill: bool = Field(default=False,
                                          description="Enable chunked prefill.")

-    guided_decoding_backend: Optional[str] = Field(
-        default=None, description="Guided decoding backend.")
+    guided_decoding_backend: Optional[Literal["xgrammar", "llguidance"]] = Field(
+        default=None,
+        description=
+        "Guided decoding backend. llguidance is supported in PyTorch backend only."
+    )

     batched_logits_processor: Optional[object] = Field(
         default=None,
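Narrowing the field to a `Literal` means unknown backend names now fail pydantic validation when the arguments are constructed, instead of slipping through as arbitrary strings. A hedged usage sketch (model path illustrative):

```python
from tensorrt_llm import LLM

# xgrammar is accepted by both runtimes; llguidance only by the
# PyTorch backend, per the field description above.
llm = LLM("meta-llama/Llama-3.1-8B-Instruct",
          guided_decoding_backend="xgrammar")

# A typo such as guided_decoding_backend="xgrammer" is now expected to
# raise a pydantic validation error at construction time.
```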
2 changes: 1 addition & 1 deletion tensorrt_llm/sampling_params.py
@@ -19,7 +19,7 @@ class GuidedDecodingParams:
         regex (str, optional): The generated text is amenable to the user-specified regular expression. Defaults to None.
         grammar (str, optional): The generated text is amenable to the user-specified extended Backus-Naur form (EBNF) grammar. Defaults to None.
         json_object (bool): If True, the generated text is amenable to json format. Defaults to False.
-        structural_tag (str, optional): The generated text is amenable to the user-specified structural tag. Defaults to None.
+        structural_tag (str, optional): The generated text is amenable to the user-specified structural tag. Structural tag is supported by xgrammar in PyTorch backend only. Defaults to None.
     """  # noqa: E501

     json: Optional[Union[str, BaseModel, dict]] = None
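For reference, a minimal end-to-end sketch of guided decoding through `SamplingParams`, assuming the PyTorch-backend `LLM` configured as in the tests below (model path and schema are illustrative):

```python
from tensorrt_llm import LLM
from tensorrt_llm.sampling_params import (GuidedDecodingParams,
                                          SamplingParams)

llm = LLM("meta-llama/Llama-3.1-8B-Instruct",  # illustrative
          guided_decoding_backend="llguidance")

schema = '{"type": "object", "properties": {"answer": {"type": "string"}}}'
params = SamplingParams(
    max_tokens=64,
    guided_decoding=GuidedDecodingParams(json=schema))

with llm:
    outputs = llm.generate(["Reply in JSON: what is the capital of France?"],
                           params)
    print(outputs[0].outputs[0].text)
```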
10 changes: 6 additions & 4 deletions tests/integration/defs/accuracy/test_llm_api.py
@@ -58,16 +58,18 @@ def test_tp2cp2(self):
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)

-    def test_guided_decoding(self):
-        llm = LLM(self.MODEL_PATH, guided_decoding_backend="xgrammar")
+    @pytest.mark.parametrize("backend", ["xgrammar"])
+    def test_guided_decoding(self, backend: str):
+        llm = LLM(self.MODEL_PATH, guided_decoding_backend=backend)
         with llm:
             task = JsonModeEval(self.MODEL_NAME)
             task.evaluate(llm)

     @pytest.mark.skip_less_device(4)
-    def test_guided_decoding_4gpus(self):
+    @pytest.mark.parametrize("backend", ["xgrammar"])
+    def test_guided_decoding_4gpus(self, backend: str):
         llm = LLM(self.MODEL_PATH,
-                  guided_decoding_backend="xgrammar",
+                  guided_decoding_backend=backend,
                   tensor_parallel_size=2,
                   pipeline_parallel_size=2)
         with llm:
14 changes: 10 additions & 4 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -12,6 +12,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
+
 import pytest

 from tensorrt_llm import LLM
@@ -277,19 +279,23 @@ def test_ngram(self):
         task = MMLU(self.MODEL_NAME)
         task.evaluate(llm)

-    def test_guided_decoding(self):
+    @pytest.mark.parametrize("backend", ["xgrammar", "llguidance"])
+    def test_guided_decoding(self, backend: str, mocker):
+        mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
         llm = LLM(self.MODEL_PATH,
-                  guided_decoding_backend="xgrammar",
+                  guided_decoding_backend=backend,
                   disable_overlap_scheduler=True,
                   use_cuda_graph=True)
         with llm:
             task = JsonModeEval(self.MODEL_NAME)
             task.evaluate(llm)

     @pytest.mark.skip_less_device(4)
-    def test_guided_decoding_4gpus(self):
+    @pytest.mark.parametrize("backend", ["xgrammar", "llguidance"])
+    def test_guided_decoding_4gpus(self, backend: str, mocker):
+        mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
         llm = LLM(self.MODEL_PATH,
-                  guided_decoding_backend="xgrammar",
+                  guided_decoding_backend=backend,
                   disable_overlap_scheduler=True,
                   use_cuda_graph=True,
                   tensor_parallel_size=2,
10 changes: 6 additions & 4 deletions tests/integration/test_lists/qa/examples_test_list.txt
@@ -418,8 +418,8 @@ accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_weight_only
 accuracy/test_cli_flow.py::TestQwen2_7BInstruct::test_int4_awq_prequantized
 accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp4
 accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp2pp2
-accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding
-accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus
+accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
+accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
 accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_auto_dtype
 accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_weight_only
 accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise
@@ -440,8 +440,10 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance]
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
2 changes: 1 addition & 1 deletion tests/integration/test_lists/test-db/l0_a100.yml
@@ -46,7 +46,7 @@ l0_a100:
   - accuracy/test_cli_flow.py::TestLlama3_2_1B::test_smooth_quant_ootb_manage_weights
   - accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_int8_gptq
   - accuracy/test_cli_flow.py::TestQwen2_7BInstruct::test_int4_awq_prequantized
-  - accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding
+  - accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
 - condition:
     ranges:
       system_gpu_count:
2 changes: 1 addition & 1 deletion tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -31,7 +31,7 @@ l0_dgx_h100:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=TRTLLM-torch_compile=True]
-  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus
+  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
   - disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
3 changes: 2 additions & 1 deletion tests/integration/test_lists/test-db/l0_dgx_h200.yml
@@ -92,6 +92,7 @@ l0_dgx_h200:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=FLASHINFER-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance]
 - condition:
     ranges:
       system_gpu_count:
@@ -120,7 +121,7 @@ l0_dgx_h200:
   - accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_tp2
   - accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_cp2
   - accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_tp2cp2
-  - accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus
+  - accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
   - examples/test_llama.py::test_llm_llama_long_alpaca_8gpu_summary[pg64317-tp4pp2-nb:4]
   - examples/test_llama.py::test_llm_llama_v2_lora_benchmark_2gpu[chinese_lora-llama-v2-13b-hf]
   - examples/test_mixtral.py::test_llm_mixtral_moe_plugin_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora]
3 changes: 2 additions & 1 deletion tests/integration/test_lists/test-db/l0_h100.yml
@@ -29,7 +29,7 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16[attn_backend=FLASHINFER-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER] TIMEOUT (60)
-  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding
+  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=False-attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=False-attn_backend=TRTLLM-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=False-attn_backend=FLASHINFER-torch_compile=False]
@@ -182,6 +182,7 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency]
+  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]
 - condition:
     ranges:
       system_gpu_count:
2 changes: 1 addition & 1 deletion tests/integration/test_lists/waives.txt
@@ -433,7 +433,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5349343)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5349343)
 full:B200/test_e2e.py::test_ptp_quickstart_advanced_deepseek_multi_nodes[DeepSeek-R1/DeepSeek-R1-0528-FP4] SKIP (https://nvbugs/5344688)
-accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus SKIP (https://nvbugs/5346443)
+accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] SKIP (https://nvbugs/5346443)
 test_e2e.py::test_openai_reasoning SKIP (https://nvbugs/5355091)
 test_e2e.py::test_openai_misc_example SKIP (https://nvbugs/5355091)
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False] SKIP (https://nvbugs/5354956)
@@ -62,7 +62,7 @@ methods:
       annotation: Optional[tensorrt_llm.sampling_params.BatchedLogitsProcessor]
       default: null
     guided_decoding_backend:
-      annotation: Optional[str]
+      annotation: Optional[Literal["xgrammar", "llguidance"]]
       default: null
     # Quantization and calibration
     quant_config: