From 626e21bf271307cab2146d1476c55392df537748 Mon Sep 17 00:00:00 2001
From: arvinder004
Date: Fri, 3 Oct 2025 15:54:55 +0530
Subject: [PATCH 1/3] rewritten test_offline_mode

---
 tests/utils/test_offline.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/tests/utils/test_offline.py b/tests/utils/test_offline.py
index 357005eb575b..ff11c26dfb98 100644
--- a/tests/utils/test_offline.py
+++ b/tests/utils/test_offline.py
@@ -22,7 +22,6 @@ class OfflineTests(TestCasePlus):
     @require_torch
-    @unittest.skip("This test is failing on main")  # TODO matt/ydshieh, this test needs to be fixed
     def test_offline_mode(self):
         # this test is a bit tricky since TRANSFORMERS_OFFLINE can only be changed before
         # `transformers` is loaded, and it's too late for inside pytest - so we are changing it
         # while running an external program
@@ -49,17 +48,12 @@ def test_offline_mode(self):
 def offline_socket(*args, **kwargs): raise RuntimeError("Offline mode is enabled, we shouldn't access internet")
 socket.socket = offline_socket
 """
+        # First subprocess run to warm the cache (online, no mocking)
+        stdout, _ = self._execute_with_env(load, run)
+        self.assertIn("success", stdout)
 
-        # Force fetching the files so that we can use the cache
-        mname = "hf-internal-testing/tiny-random-bert"
-        BertConfig.from_pretrained(mname)
-        BertModel.from_pretrained(mname)
-        BertTokenizer.from_pretrained(mname)
-        pipeline(task="fill-mask", model=mname)
-
-        # baseline - just load from_pretrained with normal network
-        # should succeed as TRANSFORMERS_OFFLINE=1 tells it to use local files
-        stdout, _ = self._execute_with_env(load, run, mock, TRANSFORMERS_OFFLINE="1")
+        # Second subprocess run in offline mode: ensure no network and use local cache only
+        stdout, _ = self._execute_with_env(load, mock, run, HF_HUB_OFFLINE="1")
         self.assertIn("success", stdout)
 
     @require_torch

From c190625b48d737180873f3caba8eaaa3da93269c Mon Sep 17 00:00:00 2001
From: arvinder004
Date: Fri, 3 Oct 2025 16:12:19 +0530
Subject: [PATCH 2/3] removed unused imports

---
 tests/models/gpt2/test_modeling_gpt2.py | 29 +++++++++++++++++++++++++
 tests/utils/test_offline.py             |  1 -
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py
index 89a4bf545310..1dec76759078 100644
--- a/tests/models/gpt2/test_modeling_gpt2.py
+++ b/tests/models/gpt2/test_modeling_gpt2.py
@@ -281,6 +281,35 @@ def test_training_gradient_checkpointing_use_reentrant_false(self):
         super().test_training_gradient_checkpointing_use_reentrant_false()
         self.all_model_classes = self.original_all_model_classes
 
+    @require_torch_gpu
+    def test_dtype_device_parity_logits_fp32_cpu_vs_fp16_cuda(self):
+        # minimal parity check: logits should be close across dtype/device for the same weights
+        torch.manual_seed(0)
+        tester = self.model_tester
+        config, inputs_dict = tester.prepare_config_and_inputs_for_common()
+
+        # Create a single base model on CPU in fp32
+        base_model = GPT2LMHeadModel(config).eval()
+
+        with torch.no_grad():
+            # Run on CPU fp32
+            cpu_inputs = {k: v.to("cpu") for k, v in inputs_dict.items()}
+            cpu_logits = base_model(**cpu_inputs).logits
+
+            # Clone weights to a CUDA fp16 copy
+            cuda_model = GPT2LMHeadModel(config).eval()
+            cuda_model.load_state_dict(base_model.state_dict())
+            cuda_model = cuda_model.to("cuda", dtype=torch.float16)
+
+            cuda_inputs = {k: v.to("cuda") for k, v in inputs_dict.items()}
+            cuda_logits = cuda_model(**cuda_inputs).logits.to(dtype=torch.float32, device="cpu")
+
+        # Compare with relaxed tolerances to accommodate dtype differences
+        self.assertEqual(cpu_logits.shape, cuda_logits.shape)
+        max_abs_diff = (cpu_logits - cuda_logits).abs().max().item()
+        # fp16 numerical noise tolerance
+        self.assertLessEqual(max_abs_diff, 5e-3)
+
 
 @require_torch
 class GPT2ModelLanguageGenerationTest(unittest.TestCase):
diff --git a/tests/utils/test_offline.py b/tests/utils/test_offline.py
index ff11c26dfb98..9671cc804a39 100644
--- a/tests/utils/test_offline.py
+++ b/tests/utils/test_offline.py
@@ -14,7 +14,6 @@
 
 import subprocess
 import sys
-import unittest
 
 from transformers import BertConfig, BertModel, BertTokenizer, pipeline
 from transformers.testing_utils import TestCasePlus, require_torch

From f3503a21ae888951455b41f549100d9a55f3d2a1 Mon Sep 17 00:00:00 2001
From: arvinder004
Date: Fri, 3 Oct 2025 16:32:39 +0530
Subject: [PATCH 3/3] removed unrelated changes

---
 tests/models/gpt2/test_modeling_gpt2.py | 29 -------------------------
 1 file changed, 29 deletions(-)

diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py
index 1dec76759078..89a4bf545310 100644
--- a/tests/models/gpt2/test_modeling_gpt2.py
+++ b/tests/models/gpt2/test_modeling_gpt2.py
@@ -281,35 +281,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self):
         super().test_training_gradient_checkpointing_use_reentrant_false()
         self.all_model_classes = self.original_all_model_classes
 
-    @require_torch_gpu
-    def test_dtype_device_parity_logits_fp32_cpu_vs_fp16_cuda(self):
-        # minimal parity check: logits should be close across dtype/device for the same weights
-        torch.manual_seed(0)
-        tester = self.model_tester
-        config, inputs_dict = tester.prepare_config_and_inputs_for_common()
-
-        # Create a single base model on CPU in fp32
-        base_model = GPT2LMHeadModel(config).eval()
-
-        with torch.no_grad():
-            # Run on CPU fp32
-            cpu_inputs = {k: v.to("cpu") for k, v in inputs_dict.items()}
-            cpu_logits = base_model(**cpu_inputs).logits
-
-            # Clone weights to a CUDA fp16 copy
-            cuda_model = GPT2LMHeadModel(config).eval()
-            cuda_model.load_state_dict(base_model.state_dict())
-            cuda_model = cuda_model.to("cuda", dtype=torch.float16)
-
-            cuda_inputs = {k: v.to("cuda") for k, v in inputs_dict.items()}
-            cuda_logits = cuda_model(**cuda_inputs).logits.to(dtype=torch.float32, device="cpu")
-
-        # Compare with relaxed tolerances to accommodate dtype differences
-        self.assertEqual(cpu_logits.shape, cuda_logits.shape)
-        max_abs_diff = (cpu_logits - cuda_logits).abs().max().item()
-        # fp16 numerical noise tolerance
-        self.assertLessEqual(max_abs_diff, 5e-3)
-
 
 @require_torch
 class GPT2ModelLanguageGenerationTest(unittest.TestCase):
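
The pattern patch 1/3 relies on is worth spelling out: as the test's own comment notes, TRANSFORMERS_OFFLINE (and likewise HF_HUB_OFFLINE) can only be changed before `transformers` is imported, so the test cannot toggle the flag in-process and instead runs the loading code in a fresh interpreter with a modified environment. Below is a minimal stdlib-only sketch of that pattern, assuming the `_execute_with_env` helper referenced in the diff wraps a similar subprocess call; the model name, the loading code, and HF_HUB_OFFLINE="1" come from the patch itself, while `run_in_subprocess` and `LOAD_SNIPPET` are illustrative names, not transformers API.

    import os
    import subprocess
    import sys

    # The code under test, executed in a child interpreter so that the
    # environment variables are in place before `transformers` is imported.
    LOAD_SNIPPET = """
    from transformers import BertConfig, BertModel, BertTokenizer, pipeline

    mname = "hf-internal-testing/tiny-random-bert"
    BertConfig.from_pretrained(mname)
    BertModel.from_pretrained(mname)
    BertTokenizer.from_pretrained(mname)
    pipeline(task="fill-mask", model=mname)
    print("success")
    """

    def run_in_subprocess(extra_env):
        # Merge the extra variables into a copy of the current environment;
        # check=True makes a non-zero exit raise CalledProcessError.
        env = {**os.environ, **extra_env}
        result = subprocess.run(
            [sys.executable, "-c", LOAD_SNIPPET],
            env=env,
            capture_output=True,
            text=True,
            check=True,
        )
        return result.stdout

    # First run online to warm the local cache, then rerun fully offline.
    assert "success" in run_in_subprocess({})
    assert "success" in run_in_subprocess({"HF_HUB_OFFLINE": "1"})

In the actual offline run, the patch additionally injects the `mock` snippet, which replaces `socket.socket` with a function that raises RuntimeError, so any attempted network access fails loudly instead of silently succeeding; that guard is omitted from the sketch above for brevity.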