
Commit 22b65bc

MasterJH5574 and spectrometerHBH authored and committed
[Unity][Op] Group normalization (#14194)
* [TOPI] Group normalization

As more and more ML models nowadays contain the group normalization computation, we find it beneficial to introduce this op at the TOPI level. It enables us to optimize the group normalization operation as a whole in a more convenient way. This PR introduces the group normalization op to TOPI. The group norm operation was introduced in https://arxiv.org/abs/1803.08494. The implementation uses tuple reduction, the same as the implementation of layer norm. With tuple reduction, the corresponding generated TIR function can be optimized by cross-thread reduction or rfactor through MetaSchedule.

Prior to this PR, the group normalization operations in frontend models were translated to a series of operations, which made it inconvenient to optimize the group norm op as a whole. With the TOPI implementation of group norm introduced by #14193, we can now use it to legalize the high-level group norm op and optimize it using cross-thread reduction or rfactor via MetaSchedule.

Co-authored-by: Bohan Hou <[email protected]>
1 parent 1f04221 commit 22b65bc
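
For reference, the semantics described above can be sketched in NumPy: split the channel axis into num_groups groups, normalize each group over the grouped channels and the remaining spatial axes, then apply the per-channel gamma and beta. This is only an illustrative NCHW sketch of the math (the function name and shapes are hypothetical), not the tuple-reduction TOPI implementation added by this commit.

import numpy as np

def group_norm_ref(x, gamma, beta, num_groups, epsilon=1e-5):
    # Illustrative NCHW reference: reshape (N, C, H, W) -> (N, G, C//G, H, W),
    # normalize each group over (C//G, H, W), then scale/shift per channel.
    n, c, h, w = x.shape
    assert c % num_groups == 0
    grouped = x.reshape(n, num_groups, c // num_groups, h, w)
    mean = grouped.mean(axis=(2, 3, 4), keepdims=True)
    var = grouped.var(axis=(2, 3, 4), keepdims=True)
    normed = ((grouped - mean) / np.sqrt(var + epsilon)).reshape(n, c, h, w)
    return normed * gamma.reshape(1, c, 1, 1) + beta.reshape(1, c, 1, 1)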

11 files changed: +638 additions, -57 deletions

include/tvm/relax/attrs/nn.h

Lines changed: 21 additions & 0 deletions
@@ -174,6 +174,27 @@ struct LayerNormAttrs : public tvm::AttrsNode<LayerNormAttrs> {
   }
 };  // struct LayerNormAttrs
 
+/*! \brief Attributes used in group_norm operator */
+struct GroupNormAttrs : public tvm::AttrsNode<GroupNormAttrs> {
+  int num_groups;
+  int channel_axis;
+  Array<Integer> axes;
+  double epsilon;
+  bool center;
+  bool scale;
+
+  TVM_DECLARE_ATTRS(GroupNormAttrs, "relax.attrs.GroupNormAttrs") {
+    TVM_ATTR_FIELD(num_groups).describe("The number of groups to separate the channels into.");
+    TVM_ATTR_FIELD(channel_axis).describe("The axis that represents the channel.");
+    TVM_ATTR_FIELD(axes).describe(
+        "The axes along which the normalization is applied (excluding the channel axis).");
+    TVM_ATTR_FIELD(epsilon).describe("Small float added to variance to avoid dividing by zero");
+    TVM_ATTR_FIELD(center).describe(
+        "Indicating if the beta offset will be added to the normalized tensor.");
+    TVM_ATTR_FIELD(scale).describe("Indicating if the gamma scale will be multiplied.");
+  }
+};  // struct GroupNormAttrs
+
 /*! \brief Attributes used in dropout operator */
 struct DropoutAttrs : public tvm::AttrsNode<DropoutAttrs> {
   double rate;

python/tvm/relax/frontend/torch/fx_translator.py

Lines changed: 20 additions & 34 deletions
@@ -465,44 +465,30 @@ def _layer_norm(self, node: fx.node.Node) -> relax.Var:
         )
 
     def _group_norm(self, node: fx.node.Node) -> relax.Var:
-        # torch.nn.GroupNorm(num_groups, num_channels, eps=1e-05,
-        #                    affine=True, device=None, dtype=None)
+        import torch  # type: ignore
+
         x = self.env[node.args[0]]
         module = self.named_modules[node.target]
-        num_groups = module.num_groups
-        num_channels = module.num_channels
-        eps = module.eps
-        affine = module.affine
 
-        shape = self.shape_of(x)
-        assert len(shape) == 4
-        N, C, H, W = shape[0], shape[1], shape[2], shape[3]
-        assert C == num_channels
-        assert C % num_groups == 0
-        grouped_x = self.block_builder.emit(
-            relax.op.reshape(x, [N, num_groups, C // num_groups, H, W])
-        )
-        mean_x = self.block_builder.emit(relax.op.mean(grouped_x, [2, 3, 4], keepdims=True))
-        sub_x = self.block_builder.emit(relax.op.subtract(grouped_x, mean_x))
-        square_x = self.block_builder.emit(relax.op.multiply(sub_x, sub_x))
-        sum_square_x = self.block_builder.emit(relax.op.sum(square_x, [2, 3, 4], keepdims=True))
-        var_x = self._call_binary_op(relax.op.divide, sum_square_x, (C // num_groups * H * W).value)
-        var_x_eps = self._call_binary_op(relax.op.add, var_x, eps)
-        std_x = self.block_builder.emit(relax.op.sqrt(var_x_eps))
-        norm_x = self.block_builder.emit(relax.op.divide(sub_x, std_x))
-
-        if affine:
-            weight = self.params[module.weight]
-            bias = self.params[module.bias]
-            weight_reshape = self.block_builder.emit(
-                relax.op.reshape(weight, (1, num_groups, C // num_groups, 1, 1))
-            )
-            bias_reshape = self.block_builder.emit(
-                relax.op.reshape(bias, (1, num_groups, C // num_groups, 1, 1))
+        if module.affine:
+            gamma = self.params[module.weight]
+            beta = self.params[module.bias]
+        else:
+            gamma = relax.const(torch.ones_like(module.num_channels), x.checked_type)
+            beta = relax.const(torch.zeros_like(module.num_channels), x.checked_type)
+
+        dim = len(self.shape_of(x))
+        return self.block_builder.emit(
+            relax.op.nn.group_norm(
+                x,
+                gamma,
+                beta,
+                num_groups=module.num_groups,
+                channel_axis=1,
+                axes=list(range(2, dim)),
+                epsilon=module.eps,
             )
-            norm_x = self.block_builder.emit(relax.op.multiply(norm_x, weight_reshape))
-            norm_x = self.block_builder.emit(relax.op.add(norm_x, bias_reshape))
-        return self.block_builder.emit(relax.op.reshape(norm_x, (N, C, H, W)))
+        )
 
     def _embedding(self, node: fx.node.Node) -> relax.Var:
         x = self.env[node.args[0]]
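
A hedged sketch of how the updated translator above is exercised from PyTorch. The wrapper module, shapes, and dtype are illustrative assumptions; from_fx is the frontend entry point used by the tests in this commit.

# Sketch only: trace a module containing torch.nn.GroupNorm and import it.
# The FX translator now emits a single relax.op.nn.group_norm call for it.
import torch
from torch import fx
from tvm.relax.frontend.torch import from_fx

class Wrapper(torch.nn.Module):  # hypothetical wrapper module
    def __init__(self):
        super().__init__()
        self.gn = torch.nn.GroupNorm(num_groups=3, num_channels=3, eps=1e-5)

    def forward(self, x):
        return self.gn(x)

graph_model = fx.symbolic_trace(Wrapper())
mod = from_fx(graph_model, [((1, 3, 10, 10), "float32")])
mod.show()  # main should contain R.nn.group_norm(..., num_groups=3, channel_axis=1, axes=[2, 3])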

python/tvm/relax/op/nn/nn.py

Lines changed: 58 additions & 0 deletions
@@ -527,6 +527,64 @@ def layer_norm(
     return _ffi_api.layer_norm(data, gamma, beta, axes, epsilon, center, scale)  # type: ignore
 
 
+def group_norm(
+    data: Expr,
+    gamma: Expr,
+    beta: Expr,
+    num_groups: int,
+    channel_axis: int,
+    axes: Union[int, List[int]],
+    epsilon: float = 1e-5,
+    center: bool = True,
+    scale: bool = True,
+) -> Expr:
+    r"""
+    Group normalization (Yuxin Wu et al., 2018).
+    Applies group normalization to the n-dimensional input array. This operator
+    takes an n-dimensional input array, first separates the input array into
+    groups along the channel axis, and then applies layer normalization to each group.
+
+    Parameters
+    ----------
+    data : relax.Expr
+        Input to which group_norm will be applied.
+
+    gamma : relax.Expr
+        The gamma scale factor.
+
+    beta : relax.Expr
+        The beta offset factor.
+
+    num_groups : int
+        Number of groups to separate the channels into.
+
+    channel_axis : int
+        The index of the channel axis in the input data.
+
+    axes : Union[int, List[int]]
+        The axes along which the normalization is applied (excluding the group axis).
+
+    epsilon : float
+        Small float added to variance to avoid dividing by zero.
+
+    center : bool
+        Indicating if the beta offset will be added to the normalized tensor.
+
+    scale : bool
+        Indicating if the gamma scale will be multiplied.
+
+    Returns
+    -------
+    result : relax.Expr
+        The computed result.
+    """
+    if isinstance(axes, int):
+        axes = [axes]
+    return _ffi_api.group_norm(  # type: ignore
+        data, gamma, beta, num_groups, channel_axis, axes, epsilon, center, scale
+    )
+
+
 def dropout(data: Expr, rate: float = 0.5) -> Expr:
     """Applies the dropout operation to the input tensor.
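
A hedged usage sketch of the new Python binding; the variable names, shapes, and the choice of num_groups=2 are illustrative assumptions.

# Sketch only: build a small Relax function that calls the new operator.
from tvm import relax

bb = relax.BlockBuilder()
x = relax.Var("x", relax.TensorStructInfo((1, 4, 8, 8), "float32"))
gamma = relax.Var("gamma", relax.TensorStructInfo((4,), "float32"))
beta = relax.Var("beta", relax.TensorStructInfo((4,), "float32"))
with bb.function("main", [x, gamma, beta]):
    # Normalize over groups of 2 channels together with the spatial axes [2, 3].
    gv = bb.emit(
        relax.op.nn.group_norm(
            x, gamma, beta, num_groups=2, channel_axis=1, axes=[2, 3], epsilon=1e-5
        )
    )
    bb.emit_func_output(gv)
mod = bb.get()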

python/tvm/relax/transform/legalize_ops/nn.py

Lines changed: 14 additions & 0 deletions
@@ -196,6 +196,20 @@ def _nn_layer_norm(bb: BlockBuilder, call: Call) -> Expr:
     )
 
 
+@register_legalize("relax.nn.group_norm")
+def _nn_group_norm(bb: BlockBuilder, call: Call) -> Expr:
+    return bb.call_te(
+        topi.nn.group_norm,
+        call.args[0],
+        call.args[1],
+        call.args[2],
+        call.attrs.num_groups,
+        call.attrs.channel_axis,
+        call.attrs.axes,
+        call.attrs.epsilon,
+    )
+
+
 @register_legalize("relax.nn.dropout")
 def _nn_dropout(bb: BlockBuilder, call: Call) -> Expr:
     logging.info("Dropout is handled by frontend translator at this moment and is not legalized.")
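
With this registration in place, the high-level op can be lowered to the TOPI-backed TIR implementation, which is what cross-thread reduction or rfactor in MetaSchedule then operates on. A minimal, hedged sketch follows; the module name, shapes, and attribute values are illustrative assumptions.

# Sketch only: legalize relax.nn.group_norm into a call_tir of the
# TIR PrimFunc generated from topi.nn.group_norm.
import tvm
from tvm import relax
from tvm.script import relax as R

@tvm.script.ir_module
class Module:
    @R.function
    def main(
        x: R.Tensor((1, 4, 8, 8), "float32"),
        gamma: R.Tensor((4,), "float32"),
        beta: R.Tensor((4,), "float32"),
    ) -> R.Tensor((1, 4, 8, 8), "float32"):
        gv: R.Tensor((1, 4, 8, 8), "float32") = R.nn.group_norm(
            x, gamma, beta, num_groups=2, channel_axis=1, axes=[2, 3], epsilon=1e-5
        )
        return gv

lowered = relax.transform.LegalizeOps()(Module)
lowered.show()  # a "group_norm" TIR PrimFunc now appears alongside main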

src/relax/op/nn/nn.cc

Lines changed: 83 additions & 0 deletions
@@ -233,6 +233,89 @@ TVM_REGISTER_OP("relax.nn.layer_norm")
     .add_argument("beta", "Tensor", "The beta offset factor.")
     .set_attr<FInferStructInfo>("FInferStructInfo", InferStructInfoLayerNorm);
 
+/* relax.nn.group_norm */
+TVM_REGISTER_NODE_TYPE(GroupNormAttrs);
+
+Expr group_norm(Expr data, Expr gamma, Expr beta, int num_groups, int channel_axis,
+                Array<Integer> axes, double epsilon, bool center, bool scale) {
+  ObjectPtr<GroupNormAttrs> attrs = make_object<GroupNormAttrs>();
+  attrs->num_groups = num_groups;
+  attrs->channel_axis = channel_axis;
+  attrs->axes = std::move(axes);
+  attrs->epsilon = epsilon;
+  attrs->center = center;
+  attrs->scale = scale;
+
+  static const Op& op = Op::Get("relax.nn.group_norm");
+  return Call(op, {std::move(data), std::move(gamma), std::move(beta)}, Attrs{attrs}, {});
+}
+
+TVM_REGISTER_GLOBAL("relax.op.nn.group_norm").set_body_typed(group_norm);
+
+StructInfo InferStructInfoGroupNorm(const Call& call, const BlockBuilder& ctx) {
+  Op op = Downcast<Op>(call->op);
+  Array<TensorStructInfo> input_sinfo = GetInputTensorStructInfo(call, ctx);
+  const auto* attrs = call->attrs.as<GroupNormAttrs>();
+
+  TensorStructInfo data_sinfo = input_sinfo[0];
+  int channel_axis = -1;
+  if (!data_sinfo->IsUnknownNdim()) {
+    channel_axis = NormalizeAxis(call, ctx, data_sinfo->ndim, attrs->channel_axis);
+    std::vector<int> axes = NormalizeAxes(call, ctx, data_sinfo->ndim, attrs->axes);
+    // channel_axis must not be in axes.
+    if (std::find(axes.begin(), axes.end(), channel_axis) != axes.end()) {
+      ctx->ReportFatal(Diagnostic::Error(call)
+                       << op
+                       << " expects that channel_axis must not be in axes, but got channel_axis: "
+                       << channel_axis << ", axes: " << attrs->axes);
+    }
+  }
+  if (!data_sinfo->IsUnknownDtype() && !data_sinfo->dtype.is_float()) {
+    ctx->ReportFatal(Diagnostic::Error(call)
+                     << op << " expects that data must be float, but got " << data_sinfo->dtype);
+  }
+  arith::Analyzer* analyzer = ctx->GetAnalyzer();
+  const auto* data_shape = data_sinfo->shape.as<ShapeExprNode>();
+  if (data_shape != nullptr && channel_axis != -1 &&
+      analyzer->CanProve(floormod(data_shape->values[channel_axis], attrs->num_groups) != 0)) {
+    ctx->ReportFatal(Diagnostic::Error(call)
+                     << op << " expects that the size of channel_axis must be divisible by "
+                     << attrs->num_groups << ", but got " << data_shape->values[channel_axis]);
+  }
+  for (int i = 1; i < static_cast<int>(op->arguments.size()); ++i) {
+    if (input_sinfo[i]->dtype != data_sinfo->dtype) {
+      ctx->ReportFatal(Diagnostic::Error(call)
+                       << op << " expects that all inputs must have the same dtype, but got "
+                       << input_sinfo[i]->dtype << " and " << data_sinfo->dtype);
+    } else if (input_sinfo[i]->ndim != 1) {
+      ctx->ReportFatal(Diagnostic::Error(call)
+                       << op << " expects that all inputs must have ndim=1, but got "
+                       << input_sinfo[i]->ndim);
+    } else if (channel_axis != -1) {
+      const auto* shape = input_sinfo[i]->shape.as<ShapeExprNode>();
+      if (shape != nullptr && data_shape != nullptr) {
+        PrimExpr channel_size = data_shape->values[channel_axis];
+        PrimExpr input_size = shape->values[0];
+        if (analyzer->CanProve(channel_size != input_size)) {
+          ctx->ReportFatal(Diagnostic::Error(call)
+                           << op << " expects that the size of input " << i
+                           << " must be equal to the size of channel_axis, but got " << input_size
+                           << " and " << channel_size);
+        }
+      }
+    }
+  }
+  return data_sinfo;
+}
+
+TVM_REGISTER_OP("relax.nn.group_norm")
+    .set_attrs_type<GroupNormAttrs>()
+    .set_num_inputs(3)
+    .add_argument("data", "Tensor", "Input to which group_norm will be applied.")
+    .add_argument("gamma", "Tensor", "The gamma scale factor.")
+    .add_argument("beta", "Tensor", "The beta offset factor.")
+    .set_attr<FInferStructInfo>("FInferStructInfo", InferStructInfoGroupNorm);
+
 /* relax.nn.dropout */
 TVM_REGISTER_NODE_TYPE(DropoutAttrs);

src/relax/op/nn/nn.h

Lines changed: 4 additions & 0 deletions
@@ -68,6 +68,10 @@ Expr batch_norm(Expr data, Expr gamma, Expr beta, Expr moving_mean, Expr moving_
 Expr layer_norm(Expr data, Expr gamma, Expr beta, Array<Integer> axes, double epsilon, bool center,
                 bool scale);
 
+/*! \brief Compute group normalization. */
+Expr group_norm(Expr data, Expr gamma, Expr beta, int num_groups, int channel_axis,
+                Array<Integer> axes, double epsilon, bool center, bool scale);
+
 /*!
  * \brief Applies the dropout operation to the input tensor.
  * \param data The input data to the operator.

tests/python/relax/test_ast_printer.py

Lines changed: 2 additions & 2 deletions
@@ -362,7 +362,7 @@ def f(
         y: R.Tensor(("m",), "float32"),
         r: R.Tensor(dtype="int64"),
     ) -> R.Object:
-        m = T.var("int64")
+        m = T.int64()
         z: R.Tensor((32, m), "float32") = R.multiply(x, y)
         w: R.Tensor = R.multiply(z, z)
         q: R.Tensor(ndim=2) = R.add(w, w)
@@ -431,7 +431,7 @@ def test_call_tir():
     # also from test_parser
     @R.function
     def foo(x: R.Tensor(("m", "n"), "float32")):
-        m, n = T.var("int64"), T.var("int64")
+        m, n = T.int64(), T.int64()
         gv0 = R.call_tir("test.op.identity", (x,), R.Tensor((m, n), dtype="float32"))
         return gv0

tests/python/relax/test_frontend_from_fx.py

Lines changed: 11 additions & 21 deletions
@@ -708,29 +708,19 @@ def main(
             w1: R.Tensor((3,), dtype="float32"),
             w2: R.Tensor((3,), dtype="float32"),
         ) -> R.Tensor((1, 3, 10, 10), dtype="float32"):
-            # block 0
             with R.dataflow():
-                lv: R.Tensor((1, 3, 1, 10, 10), dtype="float32") = R.reshape(
-                    input_1, (1, 3, 1, 10, 10)
-                )
-                lv1: R.Tensor((1, 3, 1, 1, 1), dtype="float32") = R.mean(
-                    lv, axis=[2, 3, 4], keepdims=True
-                )
-                lv2: R.Tensor((1, 3, 1, 10, 10), dtype="float32") = R.subtract(lv, lv1)
-                lv3: R.Tensor((1, 3, 1, 10, 10), dtype="float32") = R.multiply(lv2, lv2)
-                lv4: R.Tensor((1, 3, 1, 1, 1), dtype="float32") = R.sum(
-                    lv3, axis=[2, 3, 4], keepdims=True
+                lv: R.Tensor((1, 3, 10, 10), dtype="float32") = R.nn.group_norm(
+                    input_1,
+                    w1,
+                    w2,
+                    num_groups=3,
+                    channel_axis=1,
+                    axes=[2, 3],
+                    epsilon=1.0000000000000001e-05,
+                    center=True,
+                    scale=True,
                 )
-                lv5: R.Tensor((1, 3, 1, 1, 1), dtype="float32") = R.divide(lv4, R.const(100.0))
-                lv6: R.Tensor((1, 3, 1, 1, 1), dtype="float32") = R.add(lv5, R.const(1e-05))
-                lv7: R.Tensor((1, 3, 1, 1, 1), dtype="float32") = R.sqrt(lv6)
-                lv8: R.Tensor((1, 3, 1, 10, 10), dtype="float32") = R.divide(lv2, lv7)
-                lv9: R.Tensor((1, 3, 1, 1, 1), dtype="float32") = R.reshape(w1, (1, 3, 1, 1, 1))
-                lv10: R.Tensor((1, 3, 1, 1, 1), dtype="float32") = R.reshape(w2, (1, 3, 1, 1, 1))
-                lv11: R.Tensor((1, 3, 1, 10, 10), dtype="float32") = R.multiply(lv8, lv9)
-                lv12: R.Tensor((1, 3, 1, 10, 10), dtype="float32") = R.add(lv11, lv10)
-                lv13: R.Tensor((1, 3, 10, 10), dtype="float32") = R.reshape(lv12, (1, 3, 10, 10))
-                gv: R.Tensor((1, 3, 10, 10), dtype="float32") = lv13
+                gv: R.Tensor((1, 3, 10, 10), dtype="float32") = lv
                 R.output(gv)
             return gv
