Skip to content

Commit e08df2a

Browse files
committed
tensorized C_warp init
1 parent ae06789 commit e08df2a

File tree

5 files changed

+73
-14
lines changed

5 files changed

+73
-14
lines changed

include/tvm/tir/builtin.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,7 @@ TVM_DLL const Op& ptx_mma_sp();
633633
TVM_DLL const Op& ptx_ldmatrix();
634634

635635
TVM_DLL const Op& mma_store();
636+
TVM_DLL const Op& mma_fill();
636637

637638
// TODO(tvm-team) replace the usage of the vector operations by Shuffle.
638639
/*!

src/target/source/codegen_cuda.cc

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -822,16 +822,24 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
822822
this->stream << PrintLoadMatrixAssembly(trans, num, type, local_ptr, local_elem_offset,
823823
smem_ptr, smem_elem_offset);
824824
} else if (op->op.same_as(builtin::mma_store())) {
825-
std::string dst = this->PrintExpr(op->args[1]);
826-
std::string src = this->PrintExpr(op->args[2]);
827-
std::string src_offset = this->PrintExpr(op->args[3]);
828-
std::string stride = this->PrintExpr(op->args[4]);
825+
std::string dst = this->PrintExpr(op->args[2]);
826+
std::string src = this->PrintExpr(op->args[3]);
827+
std::string src_offset = this->PrintExpr(op->args[4]);
828+
std::string stride = this->PrintExpr(op->args[5]);
829829

830830
os << "for (int i = 0; i < 4; ++i) {\n";
831831
os << dst << "[(i / 2 * 8 + threadIdx.x / 4) * " << stride
832832
<< " + (threadIdx.x % 4) * 2 + i % 2]"
833833
<< " = " << src << "[" << src_offset << " + i];\n";
834834
os << "}\n";
835+
} else if (op->op.same_as(builtin::mma_fill())) {
836+
std::string num_elem = this->PrintExpr(op->args[0]);
837+
std::string dst = this->PrintExpr(op->args[1]);
838+
std::string dst_offset = this->PrintExpr(op->args[2]);
839+
840+
os << "for (int i = 0; i < " << num_elem << "; ++i) {\n";
841+
os << dst << "[" << dst_offset << " + i] = 0.0;\n";
842+
os << "}\n";
835843
} else {
836844
CodeGenC::VisitExpr_(op, os);
837845
}

src/tir/op/builtin.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,9 @@ TIR_DEFINE_BUILTIN_FUNC(ptx_ldmatrix)
250250
TIR_DEFINE_BUILTIN_FUNC(mma_store)
251251
.set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kOpaque));
252252

253+
TIR_DEFINE_BUILTIN_FUNC(mma_fill)
254+
.set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kOpaque));
255+
253256
TIR_DEFINE_BUILTIN_FUNC(vectorhigh)
254257
.set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kPure));
255258

src/tir/transforms/lower_warp_memory.cc

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,13 @@ class WarpStoreCoeffFinder : private StmtExprVisitor {
117117
if (op->op.same_as(builtin::ptx_ldmatrix()) && op->args[3].as<VarNode>() == buffer_) {
118118
int num_matrix = op->args[1].as<IntImmNode>()->value;
119119
warp_coeff_ = num_matrix * 2;
120+
} else if (op->op.same_as(builtin::mma_fill()) && op->args[1].as<VarNode>() == buffer_) {
121+
LOG(INFO) << op->args[0];
122+
auto* ptr = op->args[0].as<IntImmNode>();
123+
CHECK(ptr);
124+
warp_coeff_ = ptr->value;
120125
}
126+
121127
StmtExprVisitor::VisitExpr_(op);
122128
}
123129

@@ -284,9 +290,20 @@ class WarpAccessRewriter : protected StmtExprMutator {
284290
if (op->op.same_as(builtin::mma_store())) {
285291
Array<PrimExpr> new_args = op->args;
286292
PrimExpr local_offset, group;
287-
if (op->args[2].get() == buffer_) {
288-
std::tie(local_offset, group) = SplitIndexByGroup(op->args[3]);
289-
new_args.Set(3, local_offset);
293+
if (op->args[3].get() == buffer_) {
294+
std::tie(local_offset, group) = SplitIndexByGroup(op->args[4]);
295+
new_args.Set(4, local_offset);
296+
return Call(op->dtype, op->op, new_args);
297+
}
298+
return GetRef<PrimExpr>(op);
299+
}
300+
301+
if (op->op.same_as(builtin::mma_fill())) {
302+
Array<PrimExpr> new_args = op->args;
303+
PrimExpr local_offset, group;
304+
if (op->args[1].get() == buffer_) {
305+
std::tie(local_offset, group) = SplitIndexByGroup(op->args[2]);
306+
new_args.Set(2, local_offset);
290307
return Call(op->dtype, op->op, new_args);
291308
}
292309
return GetRef<PrimExpr>(op);

tests/python/unittest/test_mma_16x8x8_4k_tune.py

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -196,13 +196,43 @@ def mma_store_impl(a: T.handle, c: T.handle) -> None:
196196
tx = T.env_thread("threadIdx.x")
197197
T.launch_thread(tx, 32)
198198

199-
T.evaluate(T.mma_store("m16n8", C.access_ptr("w"), C_warp.data, C_warp.elem_offset, s1, dtype="float32"))
199+
T.evaluate(T.mma_store(16, 8, C.access_ptr("w"), C_warp.data, C_warp.elem_offset, s1, dtype="float32"))
200+
201+
202+
@T.prim_func
203+
def mma_fill_desc(a: T.handle) -> None:
204+
C_warp = T.match_buffer(a, [32, 4], dtype="float32", scope="warp")
205+
206+
with T.block("root"):
207+
T.reads()
208+
T.writes(C_warp[0:32, 0:4])
209+
for i0, i1 in T.grid(32, 4):
210+
with T.block("C_warp"):
211+
i_init = T.axis.spatial(16, i1 // 2 * 8 + i0 // 4)
212+
j_init = T.axis.spatial(8, (i0 % 4) * 2 + i1 % 2)
213+
T.reads()
214+
T.writes(C_warp[i_init % 8 * 4 + j_init % 8 // 2, i_init % 16 // 8 * 2 + j_init % 2])
215+
C_warp[i_init % 8 * 4 + j_init % 8 // 2, i_init % 16 // 8 * 2 + j_init % 2] = T.float32(0)
216+
217+
218+
@T.prim_func
219+
def mma_fill_impl(a: T.handle) -> None:
220+
C_warp = T.match_buffer(a, [32, 4], dtype="float32", scope="warp", offset_factor=1)
221+
222+
with T.block("root"):
223+
T.reads()
224+
T.writes(C_warp[0:32, 0:4])
225+
tx = T.env_thread("threadIdx.x")
226+
T.launch_thread(tx, 32)
227+
228+
T.evaluate(T.mma_fill(4, C_warp.data, C_warp.elem_offset, dtype="float32"))
200229

201230

202231
tir.TensorIntrin.register("mma.ldmatrix_a", ldmatrix_a_desc, ldmatrix_a_impl)
203232
tir.TensorIntrin.register("mma.ldmatrix_b", ldmatrix_b_desc, ldmatrix_b_impl)
204233
tir.TensorIntrin.register("mma_sync", mma_sync_desc, mma_sync_impl)
205234
tir.TensorIntrin.register("mma_store", mma_store_desc, mma_store_impl)
235+
tir.TensorIntrin.register("mma_fill", mma_fill_desc, mma_fill_impl)
206236

207237
N = 4096
208238
M = 4096
@@ -381,7 +411,8 @@ def lambda_b(i, j):
381411
sch.reorder(f_1, f_2, f_0, f_3)
382412
fused_1 = sch.fuse(f_1, f_2)
383413
fused_2 = sch.fuse(f_0, f_3)
384-
sch.bind(fused_1, "threadIdx.x")
414+
# sch.bind(fused_1, "threadIdx.x")
415+
sch.tensorize(fused_1, "mma_fill")
385416

386417
warp_loop1, warp_loop2 = sch.get_loops(C_warp)[-2:]
387418
f_0, f_1 = sch.split(warp_loop1, factors=[None, 8])
@@ -394,7 +425,6 @@ def lambda_b(i, j):
394425
# return
395426

396427
sch.tensorize(fused_1, "mma_store")
397-
# sch.bind(fused_1, "threadIdx.x")
398428

399429

400430
ir_module = tvm.IRModule({"main": workload})
@@ -440,7 +470,7 @@ def lambda_b(i, j):
440470
tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3)
441471
print("ok")
442472

443-
# evaluator = f.time_evaluator(f.entry_name, dev, number=1000)
444-
# gflops = (N * M * K) * 2 / 1e9
445-
# time_ms = evaluator(a, b, c).mean * 1e3
446-
# print("matmul with tensor core: %f ms, %f GFLOPS" % (time_ms, gflops / (time_ms / 1e3)))
473+
evaluator = f.time_evaluator(f.entry_name, dev, number=1000)
474+
gflops = (N * M * K) * 2 / 1e9
475+
time_ms = evaluator(a, b, c).mean * 1e3
476+
print("matmul with tensor core: %f ms, %f GFLOPS" % (time_ms, gflops / (time_ms / 1e3)))

0 commit comments

Comments
 (0)