namespace tvm {
namespace relay {

Expr ConvolveQuantizedTensors(const Expr& quantized_data,
                              const Expr& quantized_kernel,
                              const QuantizedConv2DAttrs* param) {
  // TODO(janimesh) - Who should decide the accumulation dtype?
  if (param->input_zero_point == 0 && param->kernel_zero_point == 0) {
    // Accumulate in int32 to avoid overflowing the int8 products.
    Expr int8_conv = Conv2D(quantized_data,
                            quantized_kernel,
                            param->strides,
                            param->padding,
                            param->dilation,
                            param->groups,
                            param->channels,
                            param->kernel_size,
                            param->data_layout,
                            param->kernel_layout,
                            param->out_layout,
                            Int(32));
    return int8_conv;
  }
  LOG(FATAL) << "Only symmetric quantization is supported for now.";
  return Expr();  // to hide the warning.
}

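// For reference (an editorial note, not part of the original change): the
// asymmetric case rejected above unrolls into four terms. With data QA
// (zero point zp_a) and kernel QW (zero point zp_w), and the scales factored
// out,
//
//     conv(QA - zp_a, QW - zp_w) = conv(QA, QW)
//                                  - zp_w * sum_{ic, r, s} QA
//                                  - zp_a * sum_{ic, r, s} QW
//                                  + zp_a * zp_w * (ic * r * s)
//
// This is the unrolling described in the discuss.tvm.ai thread linked in
// QuantizedConv2DForwardRewrite below.
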
Expr ScaleHandling(const Expr& convolved_tensor,
                   const QuantizedConv2DAttrs* param) {
  // The scale handling can be done in many ways.
  // 1) Floating point handling
  //    Here we can multiply the scale into the convolved_tensor, round to the
  //    nearest integer and then cast back to int32.
  // 2) Integer only scale handling
  //    Here, the computation is converted to a fixed point computation by
  //    computing an output multiplier and shift. This is useful if the target
  //    device does not support floating point computations, or supports them
  //    only at a high cost.

  if (!param->use_integer_computation_for_scale_handling) {
    double multiplier = (param->input_scale * param->kernel_scale) /
                        param->output_scale;
    auto scalar_multiplier = MakeConstantScalar(Float(32), multiplier);
    auto casted_convolved_tensor = Cast(convolved_tensor, Float(32));
    auto scaled_fp32_tensor = Multiply(casted_convolved_tensor, scalar_multiplier);
    auto scaled_rounded_fp32_tensor = Round(scaled_fp32_tensor);
    auto scaled_tensor = Cast(scaled_rounded_fp32_tensor, Int(32));
    return scaled_tensor;
  }
  LOG(FATAL) << "Only floating point scale handling is supported for now.";
  return Expr();  // to hide the warning.
}

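// A minimal sketch (an editorial addition, not part of the original change) of
// how the integer-only branch of ScaleHandling could derive a fixed point
// multiplier and shift from the real multiplier, following the usual
// gemmlowp/TF-Lite decomposition. The helper name is hypothetical; it needs
// <cmath> and <cstdint>.
static void QuantizeMultiplier(double real_multiplier,
                               int32_t* fixed_point_multiplier, int* shift) {
  // Decompose real_multiplier as significand * 2^shift with the significand
  // in [0.5, 1).
  double significand = std::frexp(real_multiplier, shift);
  // Round the significand into a Q31 fixed point integer.
  int64_t q = static_cast<int64_t>(std::round(significand * (1LL << 31)));
  if (q == (1LL << 31)) {
    // Rounding pushed the significand up to 1.0; renormalize.
    q /= 2;
    ++(*shift);
  }
  *fixed_point_multiplier = static_cast<int32_t>(q);
}
// The scaled value is then approximately
// (x * fixed_point_multiplier) >> (31 - shift), with round-to-nearest
// handling of the discarded bits.
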
Expr ReQuantize(const Expr& scaled_output,
                const QuantizedConv2DAttrs* param) {
  Expr requantized_output = Cast(scaled_output, param->out_dtype);
  return requantized_output;
}

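// Note (editorial): a fuller requantize step would normally saturate to the
// output dtype's representable range before the cast, e.g. [-128, 127] for
// int8. A hedged sketch, assuming a Clip helper in the style of the Cast and
// Multiply helpers used above:
//
//   Expr clipped_output = Clip(scaled_output, -128, 127);
//   return Cast(clipped_output, param->out_dtype);
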
Expr QuantizedConv2DForwardRewrite(const Call& ref_call,
                                   const Array<Expr>& new_args,
                                   const NodeRef& ctx) {
  CHECK_EQ(new_args.size(), 2);
  Expr quantized_data = new_args[0];
  Expr quantized_kernel = new_args[1];
  // The attrs lookup is elided in the original excerpt; the standard pattern
  // is assumed here.
  const auto* param = ref_call->attrs.as<QuantizedConv2DAttrs>();

  // Check for current quantization support.
  CHECK_EQ(param->input_zero_point, 0)
      << "Encountered a non-zero zero point."
      << " Only symmetric quantization is supported for now.";
  CHECK_EQ(param->kernel_zero_point, 0)
      << "Encountered a non-zero zero point."
      << " Only symmetric quantization is supported for now.";
  CHECK_EQ(param->output_zero_point, 0)
      << "Encountered a non-zero zero point."
      << " Only symmetric quantization is supported for now.";
  CHECK_EQ(param->use_integer_computation_for_scale_handling, false)
      << "Only floating point scale handling is supported for now."
      << " Integer-only scale handling is not yet implemented.";

  // Lowering of the quantized_convolution.
  //
  // For FP32, the conv output is
  //     C = conv(A, W)
  // or, C(n, oc, oh, ow) = A(n, ic, oh + r, ow + s) * W(oc, ic, r, s)
  // where ic, r and s are the reduction axes.
  //
  // For quantized convolution, each tensor is represented in a quantized
  // format
  //     A = scale_a * (QA - zp_a)
  // where QA is the quantized tensor, and scale_a and zp_a are its
  // quantization params.
  //
  // For symmetric quantization, zp_* is 0 for all tensors, so the quantized
  // convolution becomes
  //
  //     scale_c * QC(n, oc, oh, ow) =
  //         scale_a * QA(n, ic, oh + r, ow + s) *
  //         scale_w * QW(oc, ic, r, s)
  //
  // So, to get the quantized tensor QC, the computation is
  //
  //     QC(n, oc, oh, ow) = (scale_a * scale_w) / scale_c *
  //         QA(n, ic, oh + r, ow + s) * QW(oc, ic, r, s)
  //
  // or,
  //     QC = K * conv(QA, QW)
  //
  // For asymmetric computation, we can perform a similar unrolling. More
  // details are at
  // https://discuss.tvm.ai/t/tf-lite-quantized-conv2d-operator-conversion/2651/8?u=janimesh

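  // As a concrete check of the formula: with scale_a = 0.5, scale_w = 0.02
  // and scale_c = 0.1, K = (0.5 * 0.02) / 0.1 = 0.1, so each int32 element
  // of conv(QA, QW) is scaled by 0.1 and rounded before the final cast.
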
  // The above computation is arranged in the following functions
  //   1) ConvolveQuantizedTensors
  //      a) For symmetric, conv(QA, QW).
  //      b) For asymmetric, it involves 4 terms.
  //   2) ScaleHandling
  //      a) Takes the convolved output and scales it.
  //      b) Can support both float and integer computation.
  //   3) ReQuantize
  //      a) Converts the intermediate dtype back to the output dtype,
  //         e.g. int8.
  Expr convolved_tensor = ConvolveQuantizedTensors(quantized_data,
                                                   quantized_kernel,
                                                   param);
  Expr scaled_output = ScaleHandling(convolved_tensor, param);
  Expr requantized_output = ReQuantize(scaled_output, param);
  // TODO(janimesh) - Look at the literature and use the right scale
  // calculations.
  return requantized_output;
}

RELAY_REGISTER_OP("nn_quantized.quantized_conv2d")