apache · minminsun · Oct 10, 2019 · were · Oct 19, 2019 · were
diff --git a/include/tvm/ir.h b/include/tvm/ir.h
@@ -1001,6 +1001,9 @@ class Realize : public StmtNode {
   /*! \brief The body of realization. */
   Stmt body;
 
+  Expr new_expr;
+  std::string free_function;
+
   void VisitAttrs(AttrVisitor* v) final {
     v->Visit("func", &func);
     v->Visit("value_index", &value_index);
@@ -1015,7 +1018,9 @@ class Realize : public StmtNode {
                            DataType type,
                            Region bounds,
                            Expr condition,
-                           Stmt body);
+                           Stmt body,
+                           Expr new_expr = Expr(),
+                           std::string free_function = std::string());
 
   static constexpr const char* _type_key = "Realize";
   TVM_DECLARE_NODE_TYPE_INFO(Realize, StmtNode);

diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h
@@ -207,6 +207,12 @@ Stmt StorageFlatten(Stmt stmt,
                     int cache_line_size,
                     bool create_bound_attribute = false);
 
+Stmt TensorCore(Stmt stmt,  // input statement; the pass returns the resulting Stmt
+                Schedule schedule,  // schedule object supplied by the caller
+                double cuda_compute_capability,  // device compute capability, e.g. 7.0
+                double cuda_version,  // CUDA toolkit version, e.g. 9.0
+                Map<Tensor, Buffer> extern_buffer);  // tensor-to-buffer binding map
+
 /*!
  * \brief Remove No Op from the Stmt.
  * \param stmt The stmt to be trasnformed

diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py
@@ -378,6 +378,18 @@ def lower(sch,
     for f in lower_phase0:
         stmt = f(stmt)
     # Phase 1
+    try:
+        # device_type 2 for GPU
+        # choose device 0
+        # attr type 4 for CUDA Compute Capability
+        cuda_compute_capability = _api_internal._GetDeviceAttr(2, 0, 4)
+        from tvm.contrib.nvcc import find_cuda_path, get_cuda_version
+        cuda_version = float(get_cuda_version(find_cuda_path()))
+    except Exception:  # pylint: disable=broad-except  # probe failed: skip the TensorCore pass
+        cuda_compute_capability = None
+    if cuda_compute_capability and float(cuda_compute_capability) >= 7.0 and cuda_version >= 9.0:
+        stmt = ir_pass.TensorCore(stmt, sch, float(cuda_compute_capability), cuda_version, binds)
+
     stmt = ir_pass.StorageFlatten(stmt, binds, 64, cfg.instrument_bound_checkers)
     stmt = ir_pass.CanonicalSimplify(stmt)
     for f in lower_phase1:

diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc
@@ -94,6 +94,13 @@ TVM_REGISTER_API("ir_pass.StorageFlatten")
     }
   });
 
+TVM_REGISTER_API("ir_pass.TensorCore")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    // Reject a wrong argument count instead of silently leaving *ret unset.
+    CHECK_EQ(args.size(), 5) << "ir_pass.TensorCore expects exactly 5 arguments";
+    *ret = TensorCore(args[0], args[1], args[2], args[3], args[4]);
+  });
+
 TVM_REGISTER_API("ir_pass.AttrsEqual")
 .set_body_typed<bool(const NodeRef&, const NodeRef&)>([](const NodeRef& lhs, const NodeRef& rhs) {
     return AttrsEqual()(lhs, rhs);

diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc
@@ -582,6 +582,27 @@ void CodeGenC::VisitExpr_(const Call *op, std::ostream& os) {  // NOLINT(*)
     os << " != ";
     this->PrintExpr(op->args[0], os);
     os << ")";
+  } else if (op->is_intrinsic("wmma::mem_col_major")) {
+    CHECK_EQ(op->args.size(), 0U);
+    os << "wmma::mem_col_major";
+  } else if (op->is_intrinsic("wmma::mem_row_major")) {
+    CHECK_EQ(op->args.size(), 0U);
+    os << "wmma::mem_row_major";
+  } else if (op->is_intrinsic("wmma::col_major")) {
+    CHECK_EQ(op->args.size(), 0U);
+    os << "wmma::col_major";
+  } else if (op->is_intrinsic("wmma::row_major")) {
+    CHECK_EQ(op->args.size(), 0U);
+    os << "wmma::row_major";
+  } else if (op->is_intrinsic("wmma::matrix_a")) {
+    CHECK_EQ(op->args.size(), 0U);
+    os << "wmma::matrix_a";
+  } else if (op->is_intrinsic("wmma::matrix_b")) {
+    CHECK_EQ(op->args.size(), 0U);
+    os << "wmma::matrix_b";
+  } else if (op->is_intrinsic("wmma::accumulator")) {
+    CHECK_EQ(op->args.size(), 0U);
+    os << "wmma::accumulator";
   } else {
     if (op->call_type == Call::Intrinsic ||
         op->call_type == Call::PureIntrinsic) {
@@ -781,12 +802,49 @@ void CodeGenC::VisitStmt_(const Allocate* op) {
   CHECK(!is_zero(op->condition));
   std::string vid = AllocVarID(op->buffer_var.get());
   if (op->new_expr.defined()) {
-    // Prefer global static allocation for the program
-    CHECK_EQ(op->free_function, "nop");
-    std::string new_data = PrintExpr(op->new_expr);
-    this->PrintIndent();
-    PrintType(op->type, stream);
-    stream << "* "<< vid << '=' << new_data << ";\n";
+    const Call* call = op->new_expr.as<Call>();
+    if (call != nullptr && call->is_intrinsic("wmma::fragment")) {
+      // args[0] is the fragment kind; args[4] (presumably the layout) is only used for a/b.
+      CHECK_EQ(call->args.size(), 5U);
+      const std::string frag_kind = PrintExpr(call->args[0]);
+      this->PrintIndent();
+      this->stream << "wmma::fragment<" << frag_kind
+                                 << "," << PrintExpr(call->args[1])
+                                 << "," << PrintExpr(call->args[2])
+                                 << "," << PrintExpr(call->args[3]);
+      // Spell the allocation's element type as the scalar type name to emit.
+      std::string type_str;
+      if (op->type == Int(8)) {
+        type_str = "signed char";
+      } else if (op->type == Int(32)) {
+        type_str = "int";
+      } else if (op->type == Float(16)) {
+        type_str = "half";
+      } else if (op->type == Float(32)) {
+        type_str = "float";
+      } else {
+        LOG(FATAL) << "Fragement type " << op->type << " is not supported for now";
+      }
+      this->stream << "," << type_str;
+      // Only matrix_a / matrix_b fragments get the trailing template argument.
+      if (frag_kind == "wmma::matrix_a" || frag_kind == "wmma::matrix_b") {
+        this->stream << ", " << PrintExpr(call->args[4]) << "> ";
+      } else if (frag_kind == "wmma::accumulator") {
+        this->stream << "> ";
+      } else {
+        LOG(FATAL) << "Invalid fragment " << frag_kind;
+      }
+      // Emitted as `wmma::fragment<...> vid[N];` with N the constant allocation size.
+      int32_t constant_size = op->constant_allocation_size();
+      this->stream << vid << "[" << constant_size << "];\n";
+    } else {
+      // Prefer global static allocation for the program
+      CHECK_EQ(op->free_function, "nop");
+      std::string new_data = PrintExpr(op->new_expr);
+      this->PrintIndent();
+      PrintType(op->type, stream);
+      stream << "* "<< vid << '=' << new_data << ";\n";
+    }
   } else {
     this->PrintIndent();
     int32_t constant_size = op->constant_allocation_size();

diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
@@ -50,10 +50,14 @@ void CodeGenCUDA::AddFunction(LoweredFunc f) {
 std::string CodeGenCUDA::Finish() {
   if (enable_fp16_) {
     decl_stream << "#include <cuda_fp16.h>\n";
+    decl_stream << "#include <mma.h>\n";
+    decl_stream << "using namespace nvcuda;\n";
   }
 
   if (enable_int8_) {
     decl_stream << "#include <sm_61_intrinsics.h>\n";
+    decl_stream << "#include <mma.h>\n";
+    decl_stream << "using namespace nvcuda;\n";
   }
 
   if (need_math_constants_h_) {
@@ -88,8 +92,15 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
   bool fail = false;
   if (t.is_float()) {
     switch (t.bits()) {
-      case 16: os << "half";
+      case 16:
         enable_fp16_ = true;
+        switch (lanes) {
+          case 1: os << "half"; return;
+          case 2: os << "float"; return;
+          case 4: os << "float2"; return;
+          case 8: os << "float4"; return;
+          default: fail = true;
+        }
         break;
       case 32: os << "float"; break;
       case 64: os << "double"; break;

diff --git a/src/lang/ir.cc b/src/lang/ir.cc
@@ -467,7 +467,9 @@ Stmt Realize::make(FunctionRef func,
                    DataType type,
                    Region bounds,
                    Expr condition,
-                   Stmt body) {
+                   Stmt body,
+                   Expr new_expr,
+                   std::string free_function) {
   for (size_t i = 0; i < bounds.size(); ++i) {
     CHECK(bounds[i]->min.defined());
     CHECK(bounds[i]->extent.defined());
@@ -485,6 +487,8 @@ Stmt Realize::make(FunctionRef func,
   node->bounds = std::move(bounds);
   node->condition = std::move(condition);
   node->body = std::move(body);
+  node->new_expr = std::move(new_expr);
+  node->free_function = std::move(free_function);
   return Stmt(node);
 }
 

diff --git a/src/pass/inject_double_buffer.cc b/src/pass/inject_double_buffer.cc
@@ -115,7 +115,7 @@ class DoubleBufferInjector : public IRMutator {
           Evaluate::make(0)));
       alloc_nest.emplace_back(Allocate::make(
           op->buffer_var, op->type, new_extents, op->condition,
-          Evaluate::make(0)));
+          Evaluate::make(0), op->new_expr, op->free_function));
       return op->body;
     } else {
       return IRMutator::Mutate_(op, s);

diff --git a/src/pass/storage_flatten.cc b/src/pass/storage_flatten.cc
@@ -242,15 +242,17 @@ class StorageFlattener : public IRMutator {
         ret = Allocate::make(
             e.buffer->data, storage_type,
             {e.buffer->strides[first_dim] * e.buffer->shape[first_dim]},
-            make_const(Bool(e.buffer->dtype.lanes()), true), body);
+            make_const(Bool(e.buffer->dtype.lanes()), true), body,
+            op->new_expr, op->free_function);
       } else {
         shape = e.buffer->shape;
         if (shape.size() == 0) {
           shape.push_back(make_const(Int(32), 1));
         }
         ret = Allocate::make(
             e.buffer->data, storage_type, shape,
-            make_const(Bool(e.buffer->dtype.lanes()), true), body);
+            make_const(Bool(e.buffer->dtype.lanes()), true), body,
+            op->new_expr, op->free_function);
       }
       ret = AttrStmt::make(
           e.buffer->data, attr::storage_scope,

diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc
@@ -577,7 +577,8 @@ class StoragePlanRewriter : public IRMutator {
                                               make_const(Int(32), 1));
           e->new_alloc = Allocate::make(
               e->alloc_var, alloc_type, {sz},
-              e->allocs[0]->condition, Evaluate::make(0));
+              e->allocs[0]->condition, Evaluate::make(0),
+              e->allocs[0]->new_expr, e->allocs[0]->free_function);
           if (e->scope.tag.length() != 0) {
             MemoryInfo info = GetMemoryInfo(e->scope.to_string());
             uint64_t total_elem = e->const_nbits / e->elem_type.bits();
@@ -967,7 +968,8 @@ class VectorAllocRewriter : public IRMutator {
                     extents[extents.size() - 1] / make_const(extents[0].type(), factor));
         return Allocate::make(
             op->buffer_var, tvec[0], extents,
-            op->condition, op->body);
+            op->condition, op->body,
+            op->new_expr, op->free_function);
       }
     }
     return stmt;