
Commit 5e086cf

add doc for mma_fill and mma_store intrin

1 parent 4f945c4

File tree

2 files changed: +25 -1 lines changed

include/tvm/tir/builtin.h

Lines changed: 24 additions & 0 deletions
@@ -632,7 +632,31 @@ TVM_DLL const Op& ptx_mma_sp();
  */
 TVM_DLL const Op& ptx_ldmatrix();
 
+/*!
+ * \brief tvm intrinsic for storing the result of PTX MMA into a destination pointer.
+ * For example, if each thread in a warp of size 32 has 4 elements from the result of
+ * m16xn8xk16 MMA in its registers, this intrinsic can be used to store the result in a
+ * 16x8 region in shared or global memory.
+ *
+ * There is no real PTX instruction that does that, but we want to hide details of
+ * complex index manipulation behind this intrinsic to simplify TIR lowering passes (e.g.
+ * LowerWarpMemory).
+ *
+ * void mma_store(IntImm m, IntImm n, Var dst_ptr, Var src_ptr, Expr src_offset, Var dst_stride);
+ */
 TVM_DLL const Op& mma_store();
+
+/*!
+ * \brief tvm intrinsic for zero-initializing an MMA accumulation register.
+ * For example, if each thread in a warp of size 32 has 8 elements from the A matrix in
+ * m16xn8xk16 MMA in its registers, this intrinsic can be used to zero-initialize its
+ * 4 accumulation registers.
+ *
+ * There is no real PTX instruction that does that, but we introduce this intrinsic for the
+ * same reason as mma_store above.
+ *
+ * void mma_fill(IntImm local_size, Var local_ptr, Expr offset);
+ */
 TVM_DLL const Op& mma_fill();
 
 // TODO(tvm-team) replace the usage of the vector operations by Shuffle.
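
As a quick illustration (not part of this commit), here is a minimal Python sketch of how the two documented signatures might be constructed as TIR calls. It assumes the ops are registered under TVM's conventional "tir." prefix and uses hypothetical variable names and dtypes; only the argument order follows the doc comments above.

    import tvm
    from tvm import tir

    # Hypothetical handles standing in for the buffers of a lowered PrimFunc.
    dst_ptr = tir.Var("dst_ptr", "handle")       # 16x8 region in shared/global memory
    src_ptr = tir.Var("src_ptr", "handle")       # per-thread accumulation registers
    dst_stride = tir.Var("dst_stride", "int32")

    # void mma_fill(IntImm local_size, Var local_ptr, Expr offset);
    fill = tir.call_intrin("float32", "tir.mma_fill", 4, src_ptr, 0)

    # void mma_store(IntImm m, IntImm n, Var dst_ptr, Var src_ptr, Expr src_offset, Var dst_stride);
    store = tir.call_intrin("float32", "tir.mma_store",
                            16, 8, dst_ptr, src_ptr, 0, dst_stride)

    # Both are void-like calls, wrapped in Evaluate when emitted as statements.
    body = tir.SeqStmt([tir.Evaluate(fill), tir.Evaluate(store)])

In practice these calls are generated inside tensor intrinsic definitions and consumed by sch.tensorize, as in the test below, rather than written by hand.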

tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py

Lines changed: 1 addition & 1 deletion
@@ -180,7 +180,7 @@ def tile_wmma_fragment(block_read, height, width):
     sch.tensorize(sch.get_loops(block_init_c)[-2], mma_fill_intrin)
     sch.tensorize(sch.get_loops(C_warp)[-2], mma_store_intrin)
 
-    print(sch.mod.script())
+    # print(sch.mod.script())
 
     f = tvm.build(sch.mod["main"], target="cuda", name="dense")
     dev = tvm.device("cuda", 0)

0 commit comments