diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py
index cd160ba48cf..68ba1a84de2 100644
--- a/tensorrt_llm/_torch/models/modeling_llama.py
+++ b/tensorrt_llm/_torch/models/modeling_llama.py
@@ -159,15 +159,20 @@ def _forward_nope(
             q = self._attention_scaling(q, position_ids)
 
         out_scale = None
+        out_scale_sf = None
         if self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4 or self.o_proj.has_fp8_block_scales:
             out_scale = self.o_proj.inv_input_scale
+        if self.o_proj.has_nvfp4 and self.support_nvfp4_output:
+            out_scale_sf = self.o_proj.input_scale
+
         q, k, v = self.convert_qkv(q, k, v)
         attn_output = self.attn.forward(q,
                                         k,
                                         v,
                                         attn_metadata,
                                         out_scale=out_scale,
+                                        out_scale_sf=out_scale_sf,
                                         attention_mask=attention_mask,
                                         mrope_config=mrope_config)
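
For context, a minimal sketch of the scale-selection logic this hunk introduces: `out_scale` (the dequant scale) is set whenever `o_proj` is quantized, and the new `out_scale_sf` (the NVFP4 scale factor) is set only when `o_proj` has NVFP4 weights and the attention backend supports NVFP4 output. The `OProj` dataclass below is a hypothetical stand-in for illustration only, not TensorRT-LLM's actual module; the attribute names mirror the diff.

```python
# Hedged sketch of the hunk's scale selection; OProj is a hypothetical stand-in.
from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass
class OProj:
    has_fp8_qdq: bool = False
    has_nvfp4: bool = False
    has_fp8_block_scales: bool = False
    inv_input_scale: Optional[float] = None  # dequant scale passed as out_scale
    input_scale: Optional[float] = None      # NVFP4 scale factor passed as out_scale_sf


def select_out_scales(o_proj: OProj,
                      support_nvfp4_output: bool) -> Tuple[Optional[float], Optional[float]]:
    """Return (out_scale, out_scale_sf) following the same branches as the hunk."""
    out_scale = None
    out_scale_sf = None
    # Any quantized o_proj supplies a dequant scale for the attention output.
    if o_proj.has_fp8_qdq or o_proj.has_nvfp4 or o_proj.has_fp8_block_scales:
        out_scale = o_proj.inv_input_scale
    # NVFP4 output is requested only when both the projection is NVFP4
    # and the attention backend can emit an FP4 tensor.
    if o_proj.has_nvfp4 and support_nvfp4_output:
        out_scale_sf = o_proj.input_scale
    return out_scale, out_scale_sf
```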