diff --git a/notebooks/en/trl_grpo_reasoning_advanced_reward.ipynb b/notebooks/en/trl_grpo_reasoning_advanced_reward.ipynb index dd8c756c..62b33827 100644 --- a/notebooks/en/trl_grpo_reasoning_advanced_reward.ipynb +++ b/notebooks/en/trl_grpo_reasoning_advanced_reward.ipynb @@ -497,6 +497,7 @@ "training_args = GRPOConfig(\n", " # Learning parameters optimized for reasoning tasks\n", " learning_rate=5e-6, # Conservative LR to prevent destabilizing reasoning\n", + " bf16=False,\n", " \n", " # Memory-efficient batch configuration\n", " per_device_train_batch_size=2, # Small batch for GPU memory constraints\n",