Commit f9c071d

[Quant][Inductor] Bug fix: mutation nodes not handled correctly for QLinearPointwiseBinaryPT2E (pytorch#127592)
Fixes pytorch#127402

- Revert some changes to `ir.MutationOutput` and inductor/test_flex_attention.py
- Add checks of mutation for QLinearPointwiseBinaryPT2E

Pull Request resolved: pytorch#127592
Approved by: https://github.com/leslie-fang-intel, https://github.com/Chillee
1 parent 1cd4199 commit f9c071d

3 files changed (+40, -10 lines)


test/inductor/test_flex_attention.py

Lines changed: 6 additions & 4 deletions
@@ -776,11 +776,13 @@ def f(q, k, v):
         metrics.reset()
         f(q, k, v)
         accessed_bytes = 1 * 8 * 1024 * 64 * torch.float32.itemsize
-        logsumexp_bytes = 1 * 8 * 1024 * torch.float32.itemsize
         num_accesses = 4  # q, k, v reads, one output.
-        self.assertEqual(
-            metrics.num_bytes_accessed, accessed_bytes * num_accesses + logsumexp_bytes
-        )
+        # TODO: Get rid of this fudge factor
+        # We need this fudge factor for now, since
+        # 1. For some reason we materialize the output of the attention unnecessarily (it's related to the mutation somehow)
+        # 2. We also write the extraneous logsumexp
+        num_accesses += 2
+        self.assertLess(metrics.num_bytes_accessed, accessed_bytes * num_accesses)
 
     @supported_platform
     @skip("Triton bug ")  # https://github.com/pytorch/pytorch/issues/124571

test/inductor/test_mkldnn_pattern_matcher.py

Lines changed: 33 additions & 0 deletions
@@ -233,6 +233,7 @@ def _test_code_common(
         rtol=1.3e-6,
         check_quantization=False,
         check_dynamic=None,
+        num_include_ops=None,
     ):
         with torch.no_grad():
             clone_inputs = self._clone_inputs(inputs)
@@ -245,6 +246,12 @@ def _test_code_common(
             )
             for op in include_ops:
                 self.assertIn(op, source_code)
+            if num_include_ops is not None:
+                assert len(include_ops) == len(num_include_ops)
+                for i in range(len(include_ops)):
+                    self.assertEqual(
+                        source_code.count(include_ops[i]), num_include_ops[i]
+                    )
             for op in exclude_ops:
                 self.assertNotIn(op, source_code)
             if check_dynamic is not None:
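For context, a minimal standalone sketch of the exact-count check that `num_include_ops` adds; the generated-source string and values below are made up for illustration:

# Hypothetical illustration of the new num_include_ops check in _test_code_common.
source_code = "op_qlinear_pointwise.call(...); op_qlinear_pointwise.call(...)"
include_ops = ["op_qlinear_pointwise.call"]
num_include_ops = [2]

assert len(include_ops) == len(num_include_ops)
for op, expected in zip(include_ops, num_include_ops):
    assert op in source_code                  # existing membership check
    assert source_code.count(op) == expected  # new: op must appear exactly `expected` times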
@@ -1808,6 +1815,32 @@ def matcher_check_fn():
             matcher_check_fn=matcher_check_fn,
             is_qat=is_qat,
         )
+        if torch._inductor.config.cpp_wrapper:
+            # For CPP wrapper
+            self._test_code_common(
+                mod,
+                (v,),
+                [
+                    "op_qlinear_pointwise.call",
+                    "op_qlinear_pointwise_binary.call",
+                ],
+                [],
+                check_quantization=True,
+                num_include_ops=[2, 2],
+            )
+        else:
+            # For python wrapper
+            self._test_code_common(
+                mod,
+                (v,),
+                [
+                    "torch.ops.onednn.qlinear_pointwise.default",
+                    "torch.ops.onednn.qlinear_pointwise.binary",
+                ],
+                [],
+                check_quantization=True,
+                num_include_ops=[2, 2],
+            )
 
     @skipIfNoDynamoSupport
     @skipIfNoONEDNN

torch/_inductor/ir.py

Lines changed: 1 addition & 6 deletions
@@ -4775,15 +4775,10 @@ def get_mutation_names(self):
 
     def __init__(self, layout, mutated_node, node_doing_mutating):
         # NB: Do not directly construct this - use `mark_node_as_mutating`
-        super().__init__(None, layout, [mutated_node], ())
+        super().__init__(None, layout, [mutated_node, node_doing_mutating], ())
         self.node_doing_mutating = node_doing_mutating
         self.name = V.graph.register_buffer(self)
 
-    def get_read_writes(self):
-        read_writes = super().get_read_writes()
-        read_writes.reads.add(dependencies.WeakDep(self.node_doing_mutating.get_name()))
-        return read_writes
-
     def should_allocate(self):
         return False
