NVIDIA · dc3671 · Jul 10, 2025 · Jul 9, 2025
@@ -123,7 +123,7 @@ def get_masked_input_and_mask(
 
 
 # We use torch.compile() to fuse the tiny pointwise ops before all_reduce/all_gather for the Embedding module.
-@torch.compile(mode="max-autotune-no-cudagraphs")
+@torch.compile(options={"max-autotune": True})
 def pre_comm_embedding_ops(
     input_: torch.Tensor,
     weight: torch.Tensor,

@@ -335,7 +335,7 @@ def __init__(self, spec_config: Eagle3Config, mapping: Mapping):
         self.max_draft_tokens = self.spec_config.max_draft_tokens
         self.mapping = mapping
 
-    @torch.compile(mode="max-autotune-no-cudagraphs")
+    @torch.compile(options={"max-autotune": True})
     def forward(self, input_ids, position_ids, hidden_states, logits,
                 attn_metadata, spec_metadata, draft_model):
         batch_size = attn_metadata.num_seqs