diff --git a/requirements/common.txt b/requirements/common.txt index e21abfb9a30b..ce0795488cc1 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -25,7 +25,7 @@ outlines == 0.1.11 ; platform_machine == "s390x" # required for outlines backend disk cache diskcache == 5.6.3 lark == 1.2.2 -xgrammar == 0.1.21; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" +xgrammar == 0.1.23; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 96dafd6add67..c81bc58f1ef4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -90,15 +90,11 @@ if TYPE_CHECKING: import xgrammar as xgr - import xgrammar.kernels.apply_token_bitmask_inplace_torch_compile as xgr_torch_compile # noqa: E501 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.v1.core.sched.output import SchedulerOutput else: xgr = LazyLoader("xgr", globals(), "xgrammar") - xgr_torch_compile = LazyLoader( - "xgr_torch_compile", globals(), - "xgrammar.kernels.apply_token_bitmask_inplace_torch_compile") logger = init_logger(__name__) @@ -1333,10 +1329,7 @@ def apply_grammar_bitmask( # so we receive it in that format. grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous() - # Force use of the torch.compile implementation from xgrammar to work - # around issues with the Triton kernel in concurrent structured output - # scenarios. See PR #19565 and issues #19493, #18376 for details. - xgr_torch_compile.apply_token_bitmask_inplace_torch_compile( + xgr.apply_token_bitmask_inplace( logits, grammar_bitmask.to(self.device, non_blocking=True), indices=out_indices if not skip_out_indices else None,