Skip to content

Commit 4996286

Browse files
committed
[Relay][Quantization] Speed-aware quantization scheme improvement
1 parent c8373ec commit 4996286

File tree

3 files changed

+33
-7
lines changed

3 files changed

+33
-7
lines changed

python/tvm/relay/build_module.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from .. import nd as _nd, target as _target, autotvm
1010
from ..contrib import graph_runtime as _graph_rt
1111
from . import ir_pass
12-
from . import expr
12+
from . import expr as _expr
1313
from .backend import interpreter as _interpreter
1414
from .backend import graph_runtime_codegen as _graph_gen
1515

@@ -22,6 +22,7 @@
2222
"FoldScaleAxis": 3,
2323
"AlterOpLayout": 3,
2424
"CanonicalizeOps": 3,
25+
"EliminateCommonSubexpr": 3,
2526
}
2627

2728

@@ -126,8 +127,8 @@ def _bind_params_by_name(func, params):
126127
arg = name_dict[k]
127128
if arg is None:
128129
raise ValueError("Multiple args in the function have name %s" % k)
129-
bind_dict[arg] = expr.const(v)
130-
return expr.bind(func, bind_dict)
130+
bind_dict[arg] = _expr.const(v)
131+
return _expr.bind(func, bind_dict)
131132

132133

133134
def optimize(func, target=None, params=None):
@@ -162,6 +163,16 @@ def optimize(func, target=None, params=None):
162163
func = ir_pass.infer_type(func)
163164
func = ir_pass.simplify_inference(func)
164165

166+
if cfg.pass_enabled("EliminateCommonSubexpr"):
167+
def fskip(expr):
168+
if isinstance(expr, _expr.Call) and expr.op.name == 'cast' and \
169+
expr.attrs.dtype == 'int32':
170+
return True
171+
return False
172+
173+
func = ir_pass.infer_type(func)
174+
func = ir_pass.eliminate_common_subexpr(func, fskip)
175+
165176
if cfg.pass_enabled("CombineParallelConv2D"):
166177
func = ir_pass.infer_type(func)
167178
func = ir_pass.combine_parallel_conv2d(func)

python/tvm/relay/quantize/_annotate.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,8 @@ def add_rewrite(ref_call, new_args, ctx):
192192
else:
193193
# quantize rhs to INPUT field if it is not Constant
194194
rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT)
195+
if lhs_kind == QAnnotateKind.ACTIVATION and rhs_kind == QAnnotateKind.ACTIVATION:
196+
rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT)
195197

196198
expr = _forward_op(ref_call, [lhs_expr, rhs_expr])
197199
return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)

src/relay/pass/quantize.cc

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ TVM_REGISTER_API("relay._quantize.annotate")
124124
}
125125
return e;
126126
};
127-
return ForwardRewrite(expr, "FQAnnotateRewrite", nullptr, nullptr);
127+
return ForwardRewrite(expr, "FQAnnotateRewrite", nullptr, fmulti_ref);
128128
});
129129

130130

@@ -329,9 +329,11 @@ float ChooseDomScale(const std::vector<const QRealizeIntExprNode*>& nptrs) {
329329

330330

331331
/* \brief Unify the dom scale of arguments */
332-
Array<Expr> UnifyDTypeScale(const Array<Expr>& args,
332+
Array<Expr> UnifyDTypeScale(const Array<Expr>& ref_args,
333+
const Array<Expr>& args,
333334
DataType* dtype_ptr,
334335
Expr* scale_ptr) {
336+
static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize");
335337
const QConfig& cfg = QConfig::Current();
336338

337339
std::vector<const QRealizeIntExprNode*> nptrs;
@@ -344,10 +346,17 @@ Array<Expr> UnifyDTypeScale(const Array<Expr>& args,
344346
}
345347

346348
// unify the data type
349+
CHECK_EQ(ref_args.size(), args.size());
347350
DataType dtype = cfg->dtype_activation;
348351
for (size_t i = 0; i < ret.size(); ++i) {
352+
auto ref_arg = ref_args[i].as<CallNode>();
349353
if (nptrs[i]->dtype != dtype) {
350354
ret.Set(i, Cast(ret[i], dtype));
355+
} else if (ref_arg && ref_arg->op.same_as(simulated_quantize) &&
356+
ref_arg->attrs.as<SimulatedQuantizeAttrs>()->kind == kQInput) {
357+
auto new_arg = Cast(ret[i], cfg->dtype_input);
358+
new_arg = StopFusion(new_arg);
359+
ret.Set(i, Cast(new_arg, dtype));
351360
}
352361
}
353362

@@ -371,7 +380,7 @@ Expr AddRealize(const Call& ref_call,
371380
if (new_args[0].as<QRealizeIntExprNode>() && new_args[1].as<QRealizeIntExprNode>()) {
372381
DataType dtype;
373382
Expr dom_scale;
374-
Array<Expr> ret_args = UnifyDTypeScale(new_args, &dtype, &dom_scale);
383+
Array<Expr> ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale);
375384
Expr ret = ForwardOp(ref_call, ret_args);
376385
return QRealizeIntExprNode::make(ret, dom_scale, dtype);
377386
}
@@ -387,15 +396,19 @@ Expr ConcatenateRealize(const Call& ref_call,
387396
const Array<Expr>& new_args,
388397
const NodeRef& ctx) {
389398
CHECK_EQ(new_args.size(), 1);
399+
CHECK_EQ(ref_call->args.size(), 1);
390400

391401
const auto* tuple = new_args[0].as<TupleNode>();
402+
const auto* ref_tuple = ref_call->args[0].as<TupleNode>();
392403
CHECK(tuple);
404+
CHECK(ref_tuple);
393405
const Array<Expr>& arr = tuple->fields;
406+
const Array<Expr>& ref_arr = ref_tuple->fields;
394407

395408
if (arr[0].as<QRealizeIntExprNode>()) {
396409
DataType dtype;
397410
Expr dom_scale;
398-
Array<Expr> ret_args = UnifyDTypeScale(arr, &dtype, &dom_scale);
411+
Array<Expr> ret_args = UnifyDTypeScale(ref_arr, arr, &dtype, &dom_scale);
399412
Expr ret = ForwardOp(ref_call, {TupleNode::make(ret_args)});
400413
return QRealizeIntExprNode::make(ret, dom_scale, dtype);
401414
} else {

0 commit comments

Comments (0)