1 change: 1 addition & 0 deletions tests/lora/test_worker.py
@@ -52,6 +52,7 @@ def set_active_loras(worker: Union[Worker, V1Worker],
seed=0,
dtype="float16",
revision=None,
enforce_eager=True,
Contributor

are you planning on keeping this eager?

Member

is it testing code that should be removed before this pr is ready?

Contributor Author
@varun-sundar-rabindranath Mar 13, 2025

I intend to keep it. The CI test was running out of memory, which I assume is because of the cudagraph capture.

Also, that specific test doesn't actually run the model.

),
load_config=LoadConfig(
download_dir=None,
25 changes: 19 additions & 6 deletions vllm/config.py
@@ -2287,9 +2287,14 @@ def compute_hash(self) -> str:
excluding anything before input ids/embeddings and after
the final hidden states.
"""
# no factors to consider.
# LoRA is not compatible with `torch.compile` .
factors: list[Any] = []
factors.append(self.max_lora_rank)
factors.append(self.max_loras)
factors.append(self.fully_sharded_loras)
factors.append(self.lora_dtype)
factors.append(self.lora_extra_vocab_size)
factors.append(self.long_lora_scaling_factors)
factors.append(self.bias_enabled)
hash_str = hashlib.md5(str(factors).encode()).hexdigest()
return hash_str

@@ -3303,6 +3308,11 @@ def compute_hash(self) -> str:
vllm_factors.append("None")
if self.lora_config:
vllm_factors.append(self.lora_config.compute_hash())
# LoRA creates static buffers based on max_num_batched_tokens.
# The tensor sizes and strides get captured in the torch.compile
# graph explicitly.
vllm_factors.append(
str(self.scheduler_config.max_num_batched_tokens))
else:
vllm_factors.append("None")
if self.speculative_config:
@@ -3453,12 +3463,15 @@ def __post_init__(self):
" Disabling `torch.compile`.")
self.compilation_config.level = CompilationLevel.NO_COMPILATION

if self.lora_config is not None and self.compilation_config.level !=\
Member

Is V0 LoRA compatible with torch.compile?

Contributor Author

Not at the moment. The SGMV ops take some forward-pass-specific metadata, such as token_nums and max_seq_length, as Python integers, and IIUC these are captured as constants during tracing when they shouldn't be.

With the lora/layers.py changes in this PR and with #14685, V0 LoRA should become compatible.
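
A rough sketch of the failure mode, assuming a toy op that takes max_seq_length as a plain Python int (names, shapes, and the dynamic=True flag here are illustrative, not the actual SGMV kernels):

import torch

def lora_shrink_with_int(x: torch.Tensor, max_seq_length: int) -> torch.Tensor:
    # The Python int gets baked into the traced graph as a constant, so the
    # compiled artifact is only valid for that one value.
    return x[:max_seq_length] * 2.0

def lora_shrink_symbolic(x: torch.Tensor) -> torch.Tensor:
    # Deriving the bound from the tensor keeps it a symbolic size under
    # torch.compile, so one graph can serve every batch shape.
    return torch.narrow(x, 0, 0, x.size(0)) * 2.0

compiled = torch.compile(lora_shrink_symbolic, dynamic=True)
compiled(torch.randn(8, 16))   # first shape
compiled(torch.randn(32, 16))  # different leading dim, ideally no recompile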

Member

Thanks for the information. Then can you keep the assert in V0?

Contributor Author

Yes, re-introduced the check for V0 👍

CompilationLevel.NO_COMPILATION:
logger.warning("LoRA is not supported with `torch.compile` yet. "
"Disabling `torch.compile`.")
if ((not envs.VLLM_USE_V1) and self.lora_config is not None
and self.compilation_config.level
!= CompilationLevel.NO_COMPILATION):
logger.warning(
"LoRA for V0 is not supported with `torch.compile` yet. "
"Disabling `torch.compile`.")
self.compilation_config.level = CompilationLevel.NO_COMPILATION


if self.model_config and self.model_config.use_mla and \
not (current_platform.is_cuda() or current_platform.is_rocm()):
logger.info(
15 changes: 9 additions & 6 deletions vllm/lora/layers.py
@@ -237,16 +237,19 @@ def set_lora(
self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings)

def forward(self, x: torch.Tensor) -> torch.Tensor:
added_tokens_mask = x > self.base_layer.org_vocab_size - 1
embeddings_indices = self.punica_wrapper.embeddings_indices
Member

Yeah, this is the culprit. It is actually a function, and it slices a tensor using a Python int, which will fail symbolic shape compilation. Changing to torch.narrow with x.size(0) is the correct fix 👍
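
A minimal sketch of the difference, assuming a statically sized indices buffer and a stored Python int (names are illustrative, not the actual punica_wrapper internals):

import torch

MAX_NUM_TOKENS = 8192  # illustrative static buffer size
_embeddings_indices = torch.zeros(2, MAX_NUM_TOKENS, dtype=torch.long)
token_nums = 8192  # Python int stashed on the wrapper for the current step

def indices_via_python_int() -> torch.Tensor:
    # Slicing with the stored int bakes the length into the traced graph, so
    # torch.compile infers the input size to be a constant (8192) and raises
    # ConstraintViolationError as soon as the batch size changes.
    return _embeddings_indices[:, :token_nums]

def indices_via_narrow(x: torch.Tensor) -> torch.Tensor:
    # Reading the length off the input tensor keeps the shape symbolic.
    return torch.narrow(_embeddings_indices, 1, 0, x.size(0))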

indices = embeddings_indices[1].view_as(x)
added_tokens_mask = torch.where(x > self.base_layer.org_vocab_size - 1,
1, 0)
embeddings_indices = torch.narrow(
self.punica_wrapper._embeddings_indices, 1, 0, x.size(0))

Contributor Author

^ Changes are to avoid errors such as:

  raise ConstraintViolationError(
torch.fx.experimental.symbolic_shapes.ConstraintViolationError: Constraints violated (L['input_ids'].size()[0], L['positions'].size()[0])! For more information, run with TORCH_LOGS="+dynamic".
  - Not all values of RelaxedUnspecConstraint(L['input_ids'].size()[0]) are valid because L['input_ids'].size()[0] was inferred to be a constant (8192).
  - Not all values of RelaxedUnspecConstraint(L['positions'].size()[0]) are valid because L['positions'].size()[0] was inferred to be a constant (8192).

indices = embeddings_indices[1]
full_lora_a_embeddings = F.embedding(
x + indices,
self.lora_a_stacked_2d,
)
indices = embeddings_indices[0].view_as(x)
full_output = self.base_layer.forward(
x.add_(indices * added_tokens_mask))
indices = embeddings_indices[0]
full_output = self.base_layer.forward(x +
(indices * added_tokens_mask))
Contributor Author

x here is the input_ids. In V1, we don't zero out the CUDA graph pad region.
Avoid the in-place update here to prevent accumulating garbage in the input buffer.
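
A small sketch of the hazard with a persistent buffer reused across steps (buffer names and sizes are illustrative):

import torch

input_ids = torch.zeros(8192, dtype=torch.long)         # persistent input buffer
indices = torch.full((8192,), 1000, dtype=torch.long)   # per-token LoRA offsets
added_tokens_mask = torch.ones(8192, dtype=torch.long)

# In-place: every forward pass adds the offsets into the same buffer, so the
# pad region (which V1 never re-zeroes) accumulates stale values step by step.
input_ids.add_(indices * added_tokens_mask)

# Out-of-place: the buffer is left untouched; only this step's temporary
# result carries the offsets.
shifted_ids = input_ids + indices * added_tokens_mask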


full_output_org = full_output
if full_output.ndim == 3:
8 changes: 6 additions & 2 deletions vllm/lora/punica_wrapper/punica_gpu.py
@@ -254,7 +254,9 @@ def add_expand(self,
y_org = y
y = y.view(-1, y.shape[-1])
if lora_bias_stacked is not None:
self._apply_bias(self.token_lora_indices, y, output_slices,
token_lora_indices = torch.narrow(self._token_lora_indices, 0, 0,
y.size(0))
self._apply_bias(token_lora_indices, y, output_slices,
lora_bias_stacked)

if env.VLLM_USE_V1:
@@ -365,7 +367,9 @@ def add_lora_linear(self,
assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
if lora_bias_stacked is not None:
assert len(lora_bias_stacked) == len(output_slices)
y = self._apply_bias(self.token_lora_indices, y, output_slices,
token_lora_indices = torch.narrow(self._token_lora_indices, 0, 0,
y.size(0))
y = self._apply_bias(token_lora_indices, y, output_slices,
lora_bias_stacked)
Contributor Author

^ Changes are to avoid errors such as:

  raise ConstraintViolationError(
torch.fx.experimental.symbolic_shapes.ConstraintViolationError: Constraints violated (L['input_ids'].size()[0], L['positions'].size()[0])! For more information, run with TORCH_LOGS="+dynamic".
  - Not all values of RelaxedUnspecConstraint(L['input_ids'].size()[0]) are valid because L['input_ids'].size()[0] was inferred to be a constant (8192).
  - Not all values of RelaxedUnspecConstraint(L['positions'].size()[0]) are valid because L['positions'].size()[0] was inferred to be a constant (8192).


if buffer is None: