
Commit abc3eec

jamesjwu authored and pytorchmergebot committed
First version of AOTAutogradCache (pytorch#126791)
This PR implements "V0" of AOTAutogradCache. Given an input to AOTAutograd, we calculate a cache key, then save an AOTAutogradCacheEntry. Each AOTAutogradCacheEntry has:
- A CompiledForward and optionally a CompiledBackward
- A bunch of metadata

CompiledForward and CompiledBackward each save the *key* to the FXGraphCache entry associated with the compiled object. FXGraphCache populates this key field as long as it's able to return a compiled graph given a set of inputs. We then load the same object from FXGraphCache on an AOTAutogradCache hit.

On cache miss:
- Run AOTAutograd, up to AOTAutogradDispatch.post_compile.
- Save an AOTAutogradCacheEntry to the cache after compiling the necessary portions and receiving a cache key from FXGraphCache. Here we *always* compile the backward ahead of time; the PR stacked on top of this one implements lazy backward caching, so that we only save to the cache after compiling the backward in a lazy-backward scenario.
- Return the resulting object.

On cache hit:
- Run AOTAutogradCacheEntry.post_compile() on the cache key.
- This attempts to load the forward and backward graphs from FXGraphCache.
- As long as we successfully load from FXGraphCache, it's a hit. We then rewrap the callable with post-compile wrappers using our saved metadata.

For now, we ignore the fakified-out and debug wrappers, and we only save to the cache if fakified out is turned off.

V0 guards behavior: FXGraphCache serializes the guards that are needed in the shape_env based on the symint inputs to the graph. The invariant AOTAutograd relies on here is that the sources of the symints given to it by dynamo are exactly the same as the ones it passes to inductor, for both the forward and backward passes. (This does *not* mean that the tensor values passed in are the same, only that their symints are.) That is, AOTAutograd and inductor never create new guards based on symints with *different sources* than those dynamo passed in. We don't currently store any AOTAutograd-specific guards: my hypothesis is that FXGraphCache already stores these, since any guards generated by AOTAutograd should already be in the shape_env before calling into inductor, and we don't generate new guards post-inductor. If this turns out to be needed, I'll add it in another diff.

Testing: we start with some basic unit tests; I'll be adding more complicated testing as the next step.

Pull Request resolved: pytorch#126791
Approved by: https://github.com/bdhirsh
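To make the hit/miss flow above concrete, here is a minimal, illustrative sketch of the entry structure and lookup path. This is not the code added by this PR: the dict-based caches, lookup_or_compile, and compile_fw_bw are hypothetical stand-ins, and the real implementation lives in torch._functorch._aot_autograd.autograd_cache and torch._inductor.codecache.

# Illustrative sketch only -- the names and signatures below are hypothetical,
# not the actual PyTorch internals.
from dataclasses import dataclass
from typing import Callable, Dict, Optional, Tuple


@dataclass
class CompiledForward:
    fx_graph_cache_key: str  # key into FXGraphCache, not the compiled artifact itself


@dataclass
class CompiledBackward:
    fx_graph_cache_key: str


@dataclass
class AOTAutogradCacheEntry:
    compiled_fw: CompiledForward
    compiled_bw: Optional[CompiledBackward]
    metadata: dict  # whatever post_compile needs to rewrap the callable


def lookup_or_compile(
    cache: Dict[str, AOTAutogradCacheEntry],
    fx_graph_cache: Dict[str, Callable],
    key: str,
    compile_fw_bw: Callable[[], Tuple[str, Optional[str], dict, Callable]],
) -> Callable:
    entry = cache.get(key)
    if entry is not None:
        # Cache hit: reload the forward (and backward) from FXGraphCache by the
        # saved keys; it only counts as a hit if every load succeeds.
        fw = fx_graph_cache.get(entry.compiled_fw.fx_graph_cache_key)
        bw_needed = entry.compiled_bw is not None
        bw = fx_graph_cache.get(entry.compiled_bw.fx_graph_cache_key) if bw_needed else None
        if fw is not None and (not bw_needed or bw is not None):
            # Real code would rewrap fw/bw with the post-compile wrappers here,
            # using entry.metadata.
            return fw
    # Cache miss: compile forward (and backward), then save an entry that records
    # the FXGraphCache keys handed back by the compiler.
    fw_key, bw_key, metadata, compiled_fn = compile_fw_bw()
    cache[key] = AOTAutogradCacheEntry(
        CompiledForward(fw_key),
        CompiledBackward(bw_key) if bw_key is not None else None,
        metadata,
    )
    return compiled_fn

Storing only FXGraphCache keys, rather than the compiled artifacts themselves, keeps AOTAutogradCache a thin index: FXGraphCache stays the single owner of the compiled graphs and their serialized guards, which is why a hit is only counted once every per-graph load succeeds.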
1 parent 2e065f2 commit abc3eec

File tree

9 files changed: +741, -24 lines


test/dynamo/test_aot_autograd_cache.py

Lines changed: 255 additions & 4 deletions
@@ -1,19 +1,270 @@
 # Owner(s): ["module: dynamo"]
 
+import os
+import unittest
+
 import torch
 import torch._dynamo
 import torch._dynamo.test_case
 
 import torch._functorch._aot_autograd
+from torch._dynamo.utils import counters
 from torch._functorch import config as functorch_config
 from torch._functorch._aot_autograd.autograd_cache import (
-    autograd_cache_hash,
+    AOTAutogradCache,
+    autograd_cache_key,
     BypassAOTAutogradCache,
 )
 from torch._functorch._aot_autograd.schemas import AOTConfig
 from torch._inductor import config as inductor_config
-
-
+from torch.testing._internal.common_cuda import SM80OrLater
+from torch.testing._internal.common_device_type import largeTensorTest
+from torch.testing._internal.common_utils import (
+    instantiate_parametrized_tests,
+    parametrize,
+)
+from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
+
+
+@instantiate_parametrized_tests
+class AOTAutogradCacheTests(torch._dynamo.test_case.TestCase):
+    def setUp(self):
+        """
+        Reset all counters and caches before each unit test
+        """
+        super().setUp()
+        counters.clear()
+        self._clear_all_caches()
+
+    def _clear_all_caches(self):
+        """
+        Clear every cache, including AOTAutogradCache and FXCache
+        """
+        torch._inductor.codecache.FxGraphCache.clear()
+        AOTAutogradCache.clear()
+        self._clear_dynamo_and_codecache()
+
+    def _clear_dynamo_and_codecache(self):
+        """
+        Clear unrelated caches, like dynamo and PyCodeCache
+        """
+        torch._dynamo.reset()
+        for m in torch._inductor.codecache.PyCodeCache.cache.values():
+            os.remove(m.__file__)
+        torch._inductor.codecache.PyCodeCache.cache_clear()
+
+    @inductor_config.patch("fx_graph_cache", True)
+    @functorch_config.patch({"enable_autograd_cache": True})
+    def test_basic(self):
+        """
+        Verify the interactions between FXGraphCache and AOTAutogradCache.
+        """
+
+        def fn(x, y):
+            return (x * 2, y @ y)
+
+        a = torch.rand(25)
+        b = torch.rand(5, 5)
+
+        compiled_fn = torch.compile(fn, backend="inductor")
+
+        # A first call should miss in the cache.
+        self.assertEqual(fn(a, b), compiled_fn(a, b))
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 0)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
+
+        # A second call should hit. (First reset so in-memory guards
+        # don't prevent compilation).
+        self._clear_dynamo_and_codecache()
+        self.assertEqual(fn(a, b), compiled_fn(a, b))
+
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 1)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
+
+    @inductor_config.patch("fx_graph_cache", True)
+    @functorch_config.patch({"enable_autograd_cache": True})
+    def test_clear_fx_graph_cache(self):
+        """
+        Verify the interactions between FXGraphCache and AOTAutogradCache.
+        """
+
+        def fn(x, y):
+            return (x * 2, y @ y)
+
+        a = torch.rand(25)
+        b = torch.rand(5, 5)
+
+        compiled_fn = torch.compile(fn, backend="inductor")
+
+        # A first call should miss in the cache.
+        self.assertEqual(fn(a, b), compiled_fn(a, b))
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 0)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
+
+        # Clear FX graph cache: second call should also be a miss
+        self._clear_dynamo_and_codecache()
+        torch._inductor.codecache.FxGraphCache.clear()
+        self.assertEqual(fn(a, b), compiled_fn(a, b))
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 2)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 0)
+        # We save again into the cache
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 2)
+
+    @inductor_config.patch("fx_graph_cache", False)
+    @functorch_config.patch({"enable_autograd_cache": True})
+    def test_fx_graph_cache_off(self):
+        """
+        Should not use cache if FXGraphCache is not enabled
+        """
+
+        def fn(x, y):
+            return (x * 2, y @ y)
+
+        a = torch.rand(25)
+        b = torch.rand(5, 5)
+
+        compiled_fn = torch.compile(fn, backend="inductor")
+
+        # A first call should miss in the cache.
+        self.assertEqual(fn(a, b), compiled_fn(a, b))
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_bypass"], 1)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 0)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 0)
+
+        # Clear FX graph cache: second call should also be a miss
+        self._clear_dynamo_and_codecache()
+
+        self.assertEqual(fn(a, b), compiled_fn(a, b))
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_bypass"], 2)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 0)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 0)
+
+    @inductor_config.patch("fx_graph_cache", True)
+    @functorch_config.patch({"enable_autograd_cache": True})
+    def test_autograd_function(self):
+        """
+        Tests autograd cache hits
+        """
+
+        def fn(a, b):
+            return a.sin() + b
+
+        a = torch.randn(25, requires_grad=True)
+        b = torch.randn(25, requires_grad=True)
+        a2 = a.detach().clone().requires_grad_(True)
+        b2 = b.detach().clone().requires_grad_(True)
+
+        compiled_fn = torch.compile(fn, backend="inductor")
+
+        # A first call should miss in the cache.
+        self.assertEqual(fn(a, b), compiled_fn(a2, b2))
+        fn(a, b).sum().backward()
+        compiled_fn(a2, b2).sum().backward()
+        self.assertEqual(a.grad, a2.grad)
+        self.assertEqual(b.grad, b2.grad)
+
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 0)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
+
+        # Reset all tensors
+        a = torch.randn(25, requires_grad=True)
+        b = torch.randn(25, requires_grad=True)
+        a2 = a.detach().clone().requires_grad_(True)
+        b2 = b.detach().clone().requires_grad_(True)
+
+        # A second call should hit. (First reset so in-memory guards
+        # don't prevent compilation).
+        self._clear_dynamo_and_codecache()
+        self.assertEqual(fn(a, b), compiled_fn(a2, b2))
+        fn(a, b).sum().backward()
+        compiled_fn(a2, b2).sum().backward()
+        self.assertEqual(a.grad, a2.grad)
+        self.assertEqual(b.grad, b2.grad)
+
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 1)
+        self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
+
+    @largeTensorTest("64GB", device=GPU_TYPE)
+    @parametrize("device", (GPU_TYPE,))
+    @parametrize("dtype", (torch.float16, torch.bfloat16))
+    @inductor_config.patch("fx_graph_cache", True)
+    @functorch_config.patch({"enable_autograd_cache": True})
+    def test_autograd_inductor_guards(self, device, dtype):
+        """
+        Tests that functions that would add inductor guards are cached properly
+        """
+        if device == GPU_TYPE and not HAS_GPU:
+            raise unittest.SkipTest(f"requires {GPU_TYPE}")
+        if device == "cuda" and dtype == torch.bfloat16 and not SM80OrLater:
+            raise unittest.SkipTest("requires CUDA SM80 or later")
+
+        def fn(x, y):
+            return (x + x, y + y)
+
+        compiled_fn = torch.compile(fn, dynamic=True)
+
+        # Iterate over different shapes, varying whether the total
+        # size is below or above int32. For each combination, we expect
+        # different guards around whether the symbolic sizes do or do
+        # not exceed int32.
+        shapes = (
+            ((5, 6), (7, 8)),
+            ((5, 6), (47000, 47001)),
+            ((47000, 47001), (5, 6)),
+        )
+        expected_hits = expected_misses = expected_saves = 0
+        for a_shape, b_shape in shapes:
+            a = torch.rand(a_shape, device=device, dtype=dtype)
+            b = torch.rand(b_shape, device=device, dtype=dtype)
+
+            # AVOID a dynamo reset here. We expect guards to have been
+            # added that will be violated with the new shape. We should
+            # see a recompilation (along with a cache miss).
+            res1 = compiled_fn(a, b)
+            # A first call should miss in the cache.
+            # NOTE: Currently, this cache miss is *not* due to guards,
+            # but instead because the AOTAutogradCache key calculation specializes on input shapes.
+            # Once we allow tensors with symints as part of the cache key calculation, it will
+            # instead cache miss because of guard failure.
+            expected_misses += 1
+            expected_saves += 1
+            self.assertEqual(
+                counters["aot_autograd"]["autograd_cache_miss"], expected_misses
+            )
+            self.assertEqual(
+                counters["aot_autograd"]["autograd_cache_hit"], expected_hits
+            )
+            self.assertEqual(
+                counters["aot_autograd"]["autograd_cache_saved"], expected_saves
+            )
+
+            # A second call should hit. (First reset so in-memory guards
+            # don't prevent compilation).
+
+            # Now clear dynamo and we should see a cache hit
+            # This should populate guards to dynamo's cache, so that a subsequent run with a different
+            # shape will still trigger a second call to autograd_cache.
+            self._clear_dynamo_and_codecache()
+            res2 = compiled_fn(a, b)
+            expected_hits += 1
+            self.assertEqual(
+                counters["aot_autograd"]["autograd_cache_miss"], expected_misses
+            )
+            self.assertEqual(
+                counters["aot_autograd"]["autograd_cache_hit"], expected_hits
+            )
+            self.assertEqual(
+                counters["aot_autograd"]["autograd_cache_saved"], expected_saves
+            )
+            self.assertEqual(res1, res2)
+
+
+@inductor_config.patch("fx_graph_cache", True)
 class AOTAutogradCachePicklerTests(torch._dynamo.test_case.TestCase):
     @property
     def device_type(self) -> str:
@@ -57,7 +308,7 @@ def gen_cache_key(self, f, config, inputs=None):
         if inputs is None:
             inputs = [torch.ones(3)]
         _, fx_g, example_inputs = self._get_dynamo_output(f, *inputs)
-        return autograd_cache_hash(fx_g, example_inputs, config)
+        return autograd_cache_key(fx_g, example_inputs, config)
 
     def test_basic_hash_key(self):
         def fn(x):
