Skip to content

Commit 9173dad

Browse files
ElaineBao authored and pengzhao-intel committed Aug 30, 2019
[MKLDNN] fix uint8 batch norm memory misuse (apache#16034)
* fix uint8 bn reorder memory * update resnet152-v2 int8 acc
1 parent 65928b1 commit 9173dad

File tree

4 files changed

+14
-6
lines changed

4 files changed

+14
-6
lines changed
 

‎example/quantization/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ The following models have been tested on Linux systems. Accuracy is collected on
9797
|[MobileNet 1.0](#5)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|72.23%/90.64%|72.06%/90.53%|
9898
|[MobileNetV2 1.0](#6)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|70.27%/89.62%|69.82%/89.35%|
9999
|[Inception V3](#7)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|77.76%/93.83% |78.05%/93.91% |
100-
|[ResNet152-V2](#8)|[MXNet ModelZoo](http://data.mxnet.io/models/imagenet/resnet/152-layers/)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|76.65%/93.07%|76.19%/92.88%|
100+
|[ResNet152-V2](#8)|[MXNet ModelZoo](http://data.mxnet.io/models/imagenet/resnet/152-layers/)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|76.65%/93.07%|76.25%/92.89%|
101101
|[Inception-BN](#9)|[MXNet ModelZoo](http://data.mxnet.io/models/imagenet/inception-bn/)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|72.28%/90.63%|72.02%/90.53%|
102102
| [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd) | VOC2007/2012 | 0.8366 mAP | 0.8357 mAP |
103103
| [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd) | COCO2014 | 0.2552 mAP | 0.253 mAP |

‎example/quantization/imagenet_gen_qsym_mkldnn.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,8 @@ def save_params(fname, arg_params, aux_params, logger=None):
202202
if args.model == 'imagenet1k-resnet-152':
203203
rgb_mean = '0,0,0'
204204
rgb_std = '1,1,1'
205-
excluded_sym_names += ['flatten0']
205+
# stage1_unit1_bn1 & stage4_unit1_bn1 are excluded for the sake of accuracy
206+
excluded_sym_names += ['flatten0', 'stage1_unit1_bn1', 'stage4_unit1_bn1']
206207
if exclude_first_conv:
207208
excluded_sym_names += ['conv0']
208209
elif args.model == 'imagenet1k-inception-bn':

‎src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h

+10-3
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ class MKLDNNBNForward {
184184

185185
template<typename DType>
186186
static MKLDNNBNForward &GetBNForward(const BatchNormParam& param,
187-
const OpContext &ctx, const NDArray &in_data,
187+
const OpContext &ctx, const mkldnn::memory *data_mem,
188188
unsigned flags) {
189189
#if DMLC_CXX11_THREAD_LOCAL
190190
static thread_local std::unordered_map<MKLDNNBNSignature, MKLDNNBNForward, OpHash> fwds;
@@ -194,18 +194,25 @@ static MKLDNNBNForward &GetBNForward(const BatchNormParam& param,
194194
MKLDNNBNSignature key(param);
195195
key.AddSign(ctx.is_train);
196196
key.AddSign(param.use_global_stats);
197-
key.AddSign(in_data);
197+
key.AddSign(*data_mem);
198198

199199
auto it = fwds.find(key);
200200
if (it == fwds.end()) {
201-
auto fwd_pd = _GetFwd(*in_data.GetMKLDNNData(), ctx.is_train,
201+
auto fwd_pd = _GetFwd(*data_mem, ctx.is_train,
202202
(DType) param.eps, flags);
203203
MKLDNNBNForward fwd(fwd_pd, ctx.is_train && !param.use_global_stats);
204204
it = AddToCache(&fwds, key, fwd);
205205
}
206206
return it->second;
207207
}
208208

209+
template<typename DType>
210+
static MKLDNNBNForward &GetBNForward(const BatchNormParam& param,
211+
const OpContext &ctx, const NDArray &in_data,
212+
unsigned flags) {
213+
return GetBNForward<DType>(param, ctx, in_data.GetMKLDNNData(), flags);
214+
}
215+
209216
template <typename DType>
210217
void MKLDNNBatchNormForward(const OpContext &ctx, const BatchNormParam &param,
211218
const std::vector<NDArray> &in_data,

‎src/operator/quantization/mkldnn/mkldnn_quantized_batch_norm.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ static void MKLDNNQuantizedBatchNormForward(const nnvm::NodeAttrs &attrs, const
8080
const float max_abs_output = std::max(std::abs(*min_output_ptr), std::abs(*max_output_ptr));
8181

8282
unsigned flags = mkldnn::use_global_stats | mkldnn::use_scale_shift;
83-
auto &fwd = GetBNForward<float>(param, ctx, data, flags);
83+
auto &fwd = GetBNForward<float>(param, ctx, data_mem, flags);
8484
const mkldnn::memory &weight_mem = fwd.GetWeight();
8585
CHECK_EQ(weight_mem.get_primitive_desc().get_size(), channel_count * sizeof(float) * 2);
8686
float *weight_buf = reinterpret_cast<float *>(weight_mem.get_data_handle());

0 commit comments

Comments
 (0)
Please sign in to comment.