From eea0097dcb52d5f06a2b44afb482cab0d706d18e Mon Sep 17 00:00:00 2001
From: gaoyuan <yuan.gao@noplz.name>
Date: Thu, 9 Mar 2017 16:29:55 +0800
Subject: [PATCH 01/20] NormalizeLayer for SSD

---
 paddle/gserver/layers/NormalizeLayer.cpp      | 182 ++++++++++++++++++
 paddle/gserver/layers/PriorBox.cpp            |  30 +--
 paddle/gserver/tests/test_LayerGrad.cpp       |  14 ++
 python/paddle/trainer/config_parser.py        |  10 +
 .../paddle/trainer_config_helpers/layers.py   |  31 +++
 5 files changed, 254 insertions(+), 13 deletions(-)
 create mode 100644 paddle/gserver/layers/NormalizeLayer.cpp
diff --git a/paddle/gserver/layers/NormalizeLayer.cpp b/paddle/gserver/layers/NormalizeLayer.cpp
new file mode 100644
index 0000000000..22df8adb4e
--- /dev/null
+++ b/paddle/gserver/layers/NormalizeLayer.cpp
@@ -0,0 +1,182 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/math/BaseMatrix.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+/**
+ * This layer applys normalize across the channels of each sample to a
+ * conv layer's output and scale the output by a group of trainable factors
+ * which dimensions equal to the channel's number.
+ * - Input: One and only one input layer are accepted. The input layer must be
+ *        be a data output layer.
+ * - Output: The normalized data of the input data.
+ * Reference:
+ *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
+ */
+
+class NormalizeLayer : public Layer {
+public:
+  explicit NormalizeLayer(const LayerConfig& config) : Layer(config) {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback);
+
+protected:
+  size_t channels_;
+  std::unique_ptr<Weight> scale_;
+  MatrixPtr scaleDiff_;
+  MatrixPtr normBuffer_;
+  MatrixPtr dataBuffer_;
+  MatrixPtr channelBuffer_;
+  MatrixPtr spatialBuffer_;
+  MatrixPtr sampleBuffer_;
+};
+
+REGISTER_LAYER(normalize, NormalizeLayer);
+
+bool NormalizeLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK(parameters_[0]);
+  channels_ = config_.num_filters();
+  scale_.reset(new Weight(channels_, 1, parameters_[0]));
+  return true;
+}
+
+void NormalizeLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  auto in = getInput(0);
+  MatrixPtr inV = getInputValue(0);
+
+  size_t batchSize = inV->getHeight();
+  size_t dataDim = inV->getWidth();
+  CHECK_EQ(getSize(), dataDim);
+
+  reserveOutput(batchSize, dataDim);
+  MatrixPtr outV = getOutputValue();
+  size_t spatialDim = dataDim / channels_;
+
+  Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
+  Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
+  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
+  Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
+  normBuffer_->zeroMem();
+  spatialBuffer_->zeroMem();
+  sampleBuffer_->zeroMem();
+  dataBuffer_->zeroMem();
+  // add eps to avoid overflow
+  normBuffer_->addScalar(*normBuffer_, 1e-6);
+  channelBuffer_->resetOne();
+  inV->square2(*dataBuffer_);
+  for (size_t i = 0; i < batchSize; i++) {
+    spatialBuffer_->zeroMem();
+    MatrixPtr inTmp = Matrix::create(
+        inV->getData() + i * dataDim, channels_, spatialDim, false, useGpu_);
+    MatrixPtr dataTmp = Matrix::create(dataBuffer_->getData() + i * dataDim,
+                                       channels_,
+                                       spatialDim,
+                                       false,
+                                       useGpu_);
+    MatrixPtr outTmp = Matrix::create(
+        outV->getData() + i * dataDim, channels_, spatialDim, false, useGpu_);
+    MatrixPtr normTmp = Matrix::create(
+        normBuffer_->getData() + i * spatialDim, 1, spatialDim, false, useGpu_);
+    // compute norm.
+    spatialBuffer_->sumCols(*dataTmp, 1, 1);
+    spatialBuffer_->sqrt2(*spatialBuffer_);
+    normTmp->copyFrom(*spatialBuffer_);
+    sampleBuffer_->mul(*channelBuffer_, *spatialBuffer_, 1., 0.);
+    sampleBuffer_->dotDiv(*inTmp, *sampleBuffer_);
+    outTmp->copyFrom(*sampleBuffer_);
+
+    // scale the layer.
+    spatialBuffer_->resetOne();
+    sampleBuffer_->mul(*scale_->getW(), *spatialBuffer_, 1., 0.);
+    outTmp->dotMul(*outTmp, *sampleBuffer_);
+  }
+}
+
+void NormalizeLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr outV = getOutputValue();
+
+  auto in = getInput(0);
+  size_t batchSize = inG->getHeight();
+  size_t dataDim = inG->getWidth();
+  size_t spatialDim = dataDim / channels_;
+
+  bool syncFlag = hl_get_sync_flag();
+  dataBuffer_->dotMul(*outG, *outV);
+  Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
+  scaleDiff_->zeroMem();
+  for (size_t i = 0; i < batchSize; i++) {
+    spatialBuffer_->zeroMem();
+    channelBuffer_->zeroMem();
+    // propagate to param.
+    MatrixPtr dataBufferTmp =
+        Matrix::create(dataBuffer_->getData() + i * dataDim,
+                       channels_,
+                       spatialDim,
+                       false,
+                       useGpu_);
+    const MatrixPtr inValueTmp = Matrix::create(
+        inV->getData() + i * dataDim, channels_, spatialDim, false, useGpu_);
+    const MatrixPtr outGradTmp = Matrix::create(
+        outG->getData() + i * dataDim, channels_, spatialDim, false, useGpu_);
+    MatrixPtr inGradTmp = Matrix::create(
+        inG->getData() + i * dataDim, channels_, spatialDim, false, useGpu_);
+    const MatrixPtr normTmp = Matrix::create(
+        normBuffer_->getData() + i * spatialDim, 1, spatialDim, false, useGpu_);
+    channelBuffer_->sumRows(*dataBufferTmp, 1, 1);
+    channelBuffer_->dotDiv(*channelBuffer_, *(scale_->getW()));
+    // accumulate sum(outGrad * outValue) / scale per channel into scaleDiff_
+    scaleDiff_->add(*channelBuffer_, 1.);
+
+    sampleBuffer_->dotMul(*inValueTmp, *outGradTmp);
+    spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.);
+    // scale the grad
+    channelBuffer_->resetOne();
+    sampleBuffer_->mul(*channelBuffer_, *spatialBuffer_, 1., 0.);
+
+    inGradTmp->dotMul(*inValueTmp, *sampleBuffer_);
+    // divide by square of norm
+    spatialBuffer_->dotMul(*normTmp, *normTmp);
+    sampleBuffer_->mul(*channelBuffer_, *spatialBuffer_, 1., 0.);
+    inGradTmp->dotDiv(*inGradTmp, *sampleBuffer_);
+    // subtract
+    inGradTmp->add(*outGradTmp, -1, 1);
+    // divide by norm
+    sampleBuffer_->mul(*channelBuffer_, *normTmp, 1., 0.);
+    inGradTmp->dotDiv(*inGradTmp, *sampleBuffer_);
+    // scale the diff
+    spatialBuffer_->resetOne();
+    sampleBuffer_->mul(*scale_->getW(), *spatialBuffer_, 1., 0.);
+    inGradTmp->dotMul(*inGradTmp, *sampleBuffer_);
+  }
+  // updata scale
+  if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_);
+  hl_set_sync_flag(false);
+  hl_set_sync_flag(syncFlag);
+  scale_->getParameterPtr()->incUpdate(callback);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp
index bcf5e912a5..331bc7672e 100644
--- a/paddle/gserver/layers/PriorBox.cpp
+++ b/paddle/gserver/layers/PriorBox.cpp
@@ -20,7 +20,7 @@ namespace paddle {
 /**
  * @brief A layer for generating priorbox locations and variances.
  * - Input: Two and only two input layer are accepted. The input layer must be
- *        be a data output layer and a convolution output layer.
+ *          be a data output layer and a convolution output layer.
  * - Output: The priorbox locations and variances of the input data.
  * Reference:
  *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
@@ -45,27 +45,32 @@ protected:
   MatrixPtr buffer_;
 };
 
+REGISTER_LAYER(priorbox, PriorBoxLayer);
+
 bool PriorBoxLayer::init(const LayerMap& layerMap,
                          const ParameterMap& parameterMap) {
   Layer::init(layerMap, parameterMap);
   auto pbConf = config_.inputs(0).priorbox_conf();
+  std::vector<real> tmp;
+  aspectRatio_.push_back(1.);
   std::copy(pbConf.min_size().begin(),
             pbConf.min_size().end(),
             std::back_inserter(minSize_));
   std::copy(pbConf.max_size().begin(),
             pbConf.max_size().end(),
             std::back_inserter(maxSize_));
-  std::copy(pbConf.aspect_ratio().begin(),
-            pbConf.aspect_ratio().end(),
-            std::back_inserter(aspectRatio_));
   std::copy(pbConf.variance().begin(),
             pbConf.variance().end(),
             std::back_inserter(variance_));
+  std::copy(pbConf.aspect_ratio().begin(),
+            pbConf.aspect_ratio().end(),
+            std::back_inserter(tmp));
   // flip
-  int inputRatioLength = aspectRatio_.size();
-  for (int index = 0; index < inputRatioLength; index++)
-    aspectRatio_.push_back(1 / aspectRatio_[index]);
-  aspectRatio_.push_back(1.);
+  int inputRatioLength = tmp.size();
+  for (int index = 0; index < inputRatioLength; index++) {
+    aspectRatio_.push_back(tmp[index]);
+    aspectRatio_.push_back(1 / tmp[index]);
+  }
   numPriors_ = aspectRatio_.size();
   if (maxSize_.size() > 0) numPriors_++;
   return true;
@@ -94,12 +99,12 @@ void PriorBoxLayer::forward(PassType passType) {
     for (int w = 0; w < layerWidth; ++w) {
       real centerX = (w + 0.5) * stepW;
       real centerY = (h + 0.5) * stepH;
-      int minSize = 0;
+      real minSize = 0;
       for (size_t s = 0; s < minSize_.size(); s++) {
         // first prior.
         minSize = minSize_[s];
-        int boxWidth = minSize;
-        int boxHeight = minSize;
+        real boxWidth = minSize;
+        real boxHeight = minSize;
         // xmin, ymin, xmax, ymax.
         tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
         tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
@@ -112,7 +117,7 @@ void PriorBoxLayer::forward(PassType passType) {
           CHECK_EQ(minSize_.size(), maxSize_.size());
           // second prior.
           for (size_t s = 0; s < maxSize_.size(); s++) {
-            int maxSize = maxSize_[s];
+            real maxSize = maxSize_[s];
             boxWidth = boxHeight = sqrt(minSize * maxSize);
             tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
             tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
@@ -145,6 +150,5 @@ void PriorBoxLayer::forward(PassType passType) {
   MatrixPtr outV = getOutputValue();
   outV->copyFrom(buffer_->data_, dim * 2);
 }
-REGISTER_LAYER(priorbox, PriorBoxLayer);
 
 }  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 14d9db5247..a7d3eaeaf9 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1623,6 +1623,20 @@ TEST(Layer, PadLayer) {
   }
 }
 
+TEST(Layer, NormalizeLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("normalize");
+  config.layerConfig.set_size(100);
+  config.layerConfig.set_num_filters(10);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "normalize", 10, false, useGpu, false, 5);
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index da937152ee..c520392195 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1619,6 +1619,16 @@ class PriorBoxLayer(LayerBase):
         self.config.size = size
 
 
+@config_layer('normalize')
+class NormalizeLayer(LayerBase):
+    def __init__(self, name, inputs, size, num_filters, **xargs):
+        super(NormalizeLayer, self).__init__(name, 'normalize', 0, inputs,
+                                             **xargs)
+        self.config.size = size
+        self.config.num_filters = num_filters
+        self.create_input_parameter(0, num_filters, [num_filters, 1])
+
+
 @config_layer('data')
 class DataLayer(LayerBase):
     def __init__(self, name, size, height=None, width=None, device=None):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index b94f8f9a78..1541b532d9 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -111,6 +111,7 @@ __all__ = [
     'out_prod_layer',
     'print_layer',
     'priorbox_layer',
+    'normalize_layer',
     'spp_layer',
     'pad_layer',
     'eos_layer',
@@ -184,6 +185,7 @@ class LayerType(object):
 
     PRINT_LAYER = "print"
     PRIORBOX_LAYER = "priorbox"
+    NORMALIZE_LAYER = "normalize"
 
     CTC_LAYER = "ctc"
     WARP_CTC_LAYER = "warp_ctc"
@@ -998,6 +1000,35 @@ def priorbox_layer(input,
         size=size)
 
 
+@wrap_name_default("normalize")
+def normalize_layer(input, name=None, param_attr=None):
+    """
+    Normalize a layer's output. This layer is necessary for ssd.
+    This layer applys normalize across the channels of each sample to
+    a conv layer's output and scales the output by a group of trainable
+    factors whose dimension equals the number of channels.
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param param_attr: The Parameter Attribute|list.
+    :type param_attr: ParameterAttribute
+    :return: LayerOutput
+    """
+    Layer(
+        name=name,
+        type=LayerType.NORMALIZE_LAYER,
+        inputs=[Input(input.name, **param_attr.attr)],
+        size=input.size,
+        num_filters=input.num_filters)
+    return LayerOutput(
+        name,
+        LayerType.NORMALIZE_LAYER,
+        parents=input,
+        num_filters=input.num_filters,
+        size=input.size)
+
+
 @wrap_name_default("seq_pooling")
 @wrap_bias_attr_default(has_bias=False)
 @wrap_param_default(['pooling_type'], default_factory=lambda _: MaxPooling())

From eb43d93a58afca13988c55eb820e67bec93160c7 Mon Sep 17 00:00:00 2001
From: gaoyuan <yuan.gao@noplz.name>
Date: Mon, 20 Mar 2017 20:17:53 +0800
Subject: [PATCH 02/20] Change Normalize layer to CrossChannelNorm layer

---
 ...izeLayer.cpp => CrossChannelNormLayer.cpp} | 81 +++----------------
 paddle/gserver/layers/NormLayer.cpp           | 12 +++
 paddle/gserver/layers/NormLayer.h             | 31 +++++++
 paddle/gserver/tests/test_LayerGrad.cpp       | 17 ++--
 paddle/math/BaseMatrix.cu                     | 18 +++++
 paddle/math/BaseMatrix.h                      |  3 +
 paddle/math/tests/test_BaseMatrix.cpp         |  2 +
 python/paddle/trainer/config_parser.py        | 21 ++---
 .../paddle/trainer_config_helpers/layers.py   | 26 +++---
 9 files changed, 115 insertions(+), 96 deletions(-)
 rename paddle/gserver/layers/{NormalizeLayer.cpp => CrossChannelNormLayer.cpp} (64%)

diff --git a/paddle/gserver/layers/NormalizeLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp
similarity index 64%
rename from paddle/gserver/layers/NormalizeLayer.cpp
rename to paddle/gserver/layers/CrossChannelNormLayer.cpp
index 22df8adb4e..ced7199991 100644
--- a/paddle/gserver/layers/NormalizeLayer.cpp
+++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp
@@ -13,53 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "Layer.h"
+#include "NormLayer.h"
 #include "paddle/math/BaseMatrix.h"
 #include "paddle/math/Matrix.h"
 
 namespace paddle {
-/**
- * This layer applys normalize across the channels of each sample to a
- * conv layer's output and scale the output by a group of trainable factors
- * which dimensions equal to the channel's number.
- * - Input: One and only one input layer are accepted. The input layer must be
- *        be a data output layer.
- * - Output: The normalized data of the input data.
- * Reference:
- *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
- *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
- */
 
-class NormalizeLayer : public Layer {
-public:
-  explicit NormalizeLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
-
-protected:
-  size_t channels_;
-  std::unique_ptr<Weight> scale_;
-  MatrixPtr scaleDiff_;
-  MatrixPtr normBuffer_;
-  MatrixPtr dataBuffer_;
-  MatrixPtr channelBuffer_;
-  MatrixPtr spatialBuffer_;
-  MatrixPtr sampleBuffer_;
-};
-
-REGISTER_LAYER(normalize, NormalizeLayer);
-
-bool NormalizeLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK(parameters_[0]);
-  channels_ = config_.num_filters();
-  scale_.reset(new Weight(channels_, 1, parameters_[0]));
-  return true;
-}
-
-void NormalizeLayer::forward(PassType passType) {
+void CrossChannelNormLayer::forward(PassType passType) {
   Layer::forward(passType);
   auto in = getInput(0);
   MatrixPtr inV = getInputValue(0);
@@ -74,16 +34,12 @@ void NormalizeLayer::forward(PassType passType) {
 
   Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
   Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
-  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
-  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
   Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
   normBuffer_->zeroMem();
   spatialBuffer_->zeroMem();
-  sampleBuffer_->zeroMem();
   dataBuffer_->zeroMem();
   // add eps to avoid overflow
   normBuffer_->addScalar(*normBuffer_, 1e-6);
-  channelBuffer_->resetOne();
   inV->square2(*dataBuffer_);
   for (size_t i = 0; i < batchSize; i++) {
     spatialBuffer_->zeroMem();
@@ -102,18 +58,14 @@ void NormalizeLayer::forward(PassType passType) {
     spatialBuffer_->sumCols(*dataTmp, 1, 1);
     spatialBuffer_->sqrt2(*spatialBuffer_);
     normTmp->copyFrom(*spatialBuffer_);
-    sampleBuffer_->mul(*channelBuffer_, *spatialBuffer_, 1., 0.);
-    sampleBuffer_->dotDiv(*inTmp, *sampleBuffer_);
-    outTmp->copyFrom(*sampleBuffer_);
-
+    outTmp->copyFrom(*inTmp);
+    outTmp->divRowVector(*spatialBuffer_);
     // scale the layer.
-    spatialBuffer_->resetOne();
-    sampleBuffer_->mul(*scale_->getW(), *spatialBuffer_, 1., 0.);
-    outTmp->dotMul(*outTmp, *sampleBuffer_);
+    outTmp->mulColVector(*scale_->getW());
   }
 }
 
-void NormalizeLayer::backward(const UpdateCallback& callback) {
+void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
   MatrixPtr inG = getInputGrad(0);
   MatrixPtr inV = getInputValue(0);
   MatrixPtr outG = getOutputGrad();
@@ -124,9 +76,10 @@ void NormalizeLayer::backward(const UpdateCallback& callback) {
   size_t dataDim = inG->getWidth();
   size_t spatialDim = dataDim / channels_;
 
-  bool syncFlag = hl_get_sync_flag();
   dataBuffer_->dotMul(*outG, *outV);
   Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
   scaleDiff_->zeroMem();
   for (size_t i = 0; i < batchSize; i++) {
     spatialBuffer_->zeroMem();
@@ -154,28 +107,20 @@ void NormalizeLayer::backward(const UpdateCallback& callback) {
     sampleBuffer_->dotMul(*inValueTmp, *outGradTmp);
     spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.);
     // scale the grad
-    channelBuffer_->resetOne();
-    sampleBuffer_->mul(*channelBuffer_, *spatialBuffer_, 1., 0.);
-
-    inGradTmp->dotMul(*inValueTmp, *sampleBuffer_);
+    inGradTmp->copyFrom(*inValueTmp);
+    inGradTmp->mulRowVector(*spatialBuffer_);
     // divide by square of norm
     spatialBuffer_->dotMul(*normTmp, *normTmp);
-    sampleBuffer_->mul(*channelBuffer_, *spatialBuffer_, 1., 0.);
-    inGradTmp->dotDiv(*inGradTmp, *sampleBuffer_);
+    inGradTmp->divRowVector(*spatialBuffer_);
     // subtract
     inGradTmp->add(*outGradTmp, -1, 1);
     // divide by norm
-    sampleBuffer_->mul(*channelBuffer_, *normTmp, 1., 0.);
-    inGradTmp->dotDiv(*inGradTmp, *sampleBuffer_);
+    inGradTmp->divRowVector(*normTmp);
     // scale the diff
-    spatialBuffer_->resetOne();
-    sampleBuffer_->mul(*scale_->getW(), *spatialBuffer_, 1., 0.);
-    inGradTmp->dotMul(*inGradTmp, *sampleBuffer_);
+    inGradTmp->mulColVector(*scale_->getW());
   }
   // updata scale
   if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_);
-  hl_set_sync_flag(false);
-  hl_set_sync_flag(syncFlag);
   scale_->getParameterPtr()->incUpdate(callback);
 }
 
diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp
index 3db0af2515..e094078bfe 100644
--- a/paddle/gserver/layers/NormLayer.cpp
+++ b/paddle/gserver/layers/NormLayer.cpp
@@ -26,6 +26,8 @@ Layer* NormLayer::create(const LayerConfig& config) {
     return new ResponseNormLayer(config);
   } else if (norm == "cmrnorm-projection") {
     return new CMRProjectionNormLayer(config);
+  } else if (norm == "cross-channel-norm") {
+    return new CrossChannelNormLayer(config);
   } else {
     LOG(FATAL) << "Unknown norm type: " << norm;
     return nullptr;
@@ -54,4 +56,14 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
   return true;
 }
 
+bool CrossChannelNormLayer::init(const LayerMap& layerMap,
+                                 const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK(parameters_[0]);
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  channels_ = conf.channels();
+  scale_.reset(new Weight(channels_, 1, parameters_[0]));
+  return true;
+}
+
 }  // namespace paddle
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
index e77faaa322..59ba226dfe 100644
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -65,4 +65,35 @@ public:
   }
 };
 
+/**
+ * This layer applies normalization across the channels of each sample to a
+ * conv layer's output and scales the output by a group of trainable factors
+ * whose dimension equals the number of channels.
+ * - Input: One and only one input layer is accepted. The input layer must
+ *        be a data output layer.
+ * - Output: The normalized data of the input data.
+ * Reference:
+ *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
+ */
+class CrossChannelNormLayer : public NormLayer {
+public:
+  explicit CrossChannelNormLayer(const LayerConfig& config)
+      : NormLayer(config) {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback);
+
+protected:
+  size_t channels_;
+  std::unique_ptr<Weight> scale_;
+  MatrixPtr scaleDiff_;
+  MatrixPtr normBuffer_;
+  MatrixPtr dataBuffer_;
+  MatrixPtr channelBuffer_;
+  MatrixPtr spatialBuffer_;
+  MatrixPtr sampleBuffer_;
+};
+
 }  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index a7d3eaeaf9..7afaf87189 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1623,17 +1623,22 @@ TEST(Layer, PadLayer) {
   }
 }
 
-TEST(Layer, NormalizeLayer) {
+TEST(Layer, CrossChannelNormLayer) {
   TestConfig config;
-  config.layerConfig.set_type("normalize");
+  config.layerConfig.set_type("norm");
   config.layerConfig.set_size(100);
-  config.layerConfig.set_num_filters(10);
-
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_norm_type("cross-channel-norm");
+  norm->set_channels(10);
+  norm->set_size(100);
+  norm->set_scale(0);
+  norm->set_pow(0);
+  norm->set_blocked(0);
   config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
-  config.layerConfig.add_inputs();
 
   for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "normalize", 10, false, useGpu, false, 5);
+    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5);
   }
 }
 
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index 0a0d92d1ae..de48b6fac9 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1453,6 +1453,24 @@ void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) {
               true_type() /* bAsRowVector */, false_type());
 }
 
+template<class T>
+void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyBinary(binary::DotMul<T>(), b, numRows, numCols, offset,
+              false_type(), true_type() /* bAsColVector */);
+}
+
+template<class T>
+void BaseMatrixT<T>::divColVector(BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyBinary(binary::DotDiv<T>(), b, numRows, numCols, offset,
+              false_type(), true_type() /* bAsColVector */);
+}
+
 template<>
 template <class Agg>
 int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index 8691c87ac3..6ed48c8d88 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -545,6 +545,9 @@ public:
   void mulRowVector(BaseMatrixT& b);
   void divRowVector(BaseMatrixT& b);
 
+  void mulColVector(BaseMatrixT& b);
+  void divColVector(BaseMatrixT& b);
+
   void addP2P(BaseMatrixT& b);
 
   /**
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
index 21918b86e1..22ce39701f 100644
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -110,6 +110,8 @@ TEST(BaseMatrix, BaseMatrix) {
       compare(&BaseMatrix::addRowVector);
       compare(&BaseMatrix::mulRowVector);
       compare(&BaseMatrix::divRowVector);
+      compare(&BaseMatrix::mulColVector);
+      compare(&BaseMatrix::divColVector);
       compare(&BaseMatrix::addP2P);
       compare(&BaseMatrix::invSqrt);
     }
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index c520392195..3e6a73dcf8 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1156,9 +1156,11 @@ def parse_image(image, input_layer_name, image_conf):
 
 def parse_norm(norm, input_layer_name, norm_conf):
     norm_conf.norm_type = norm.norm_type
-    config_assert(norm.norm_type in ['rnorm', 'cmrnorm-projection'],
-                  "norm-type %s is not in [rnorm, 'cmrnorm-projection']" %
-                  norm.norm_type)
+    config_assert(
+        norm.norm_type in
+        ['rnorm', 'cmrnorm-projection', 'cross-channel-norm'],
+        "norm-type %s is not in [rnorm, cmrnorm-projection, cross-channel-norm]"
+        % norm.norm_type)
     norm_conf.channels = norm.channels
     norm_conf.size = norm.size
     norm_conf.scale = norm.scale
@@ -1619,16 +1621,6 @@ class PriorBoxLayer(LayerBase):
         self.config.size = size
 
 
-@config_layer('normalize')
-class NormalizeLayer(LayerBase):
-    def __init__(self, name, inputs, size, num_filters, **xargs):
-        super(NormalizeLayer, self).__init__(name, 'normalize', 0, inputs,
-                                             **xargs)
-        self.config.size = size
-        self.config.num_filters = num_filters
-        self.create_input_parameter(0, num_filters, [num_filters, 1])
-
-
 @config_layer('data')
 class DataLayer(LayerBase):
     def __init__(self, name, size, height=None, width=None, device=None):
@@ -1831,6 +1823,9 @@ class NormLayer(LayerBase):
                        norm_conf)
             self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x,
                                norm_conf.channels, False)
+            if norm_conf.norm_type == "cross-channel-norm":
+                self.create_input_parameter(0, norm_conf.channels,
+                                            [norm_conf.channels, 1])
 
 
 @config_layer('pool')
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 1541b532d9..b6a9426476 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -111,7 +111,7 @@ __all__ = [
     'out_prod_layer',
     'print_layer',
     'priorbox_layer',
-    'normalize_layer',
+    'cross_channel_norm_layer',
     'spp_layer',
     'pad_layer',
     'eos_layer',
@@ -185,7 +185,6 @@ class LayerType(object):
 
     PRINT_LAYER = "print"
     PRIORBOX_LAYER = "priorbox"
-    NORMALIZE_LAYER = "normalize"
 
     CTC_LAYER = "ctc"
     WARP_CTC_LAYER = "warp_ctc"
@@ -1000,8 +999,8 @@ def priorbox_layer(input,
         size=size)
 
 
-@wrap_name_default("normalize")
-def normalize_layer(input, name=None, param_attr=None):
+@wrap_name_default("cross_channel_norm")
+def cross_channel_norm_layer(input, name=None, param_attr=None):
     """
     Normalize a layer's output. This layer is necessary for ssd.
     This layer applys normalize across the channels of each sample to
@@ -1017,13 +1016,22 @@ def normalize_layer(input, name=None, param_attr=None):
     """
     Layer(
         name=name,
-        type=LayerType.NORMALIZE_LAYER,
-        inputs=[Input(input.name, **param_attr.attr)],
-        size=input.size,
-        num_filters=input.num_filters)
+        type=LayerType.NORM_LAYER,
+        inputs=[
+            Input(
+                input.name,
+                norm=Norm(
+                    norm_type="cross-channel-norm",
+                    channels=input.num_filters,
+                    size=input.size,
+                    scale=0,
+                    pow=0,
+                    blocked=0),
+                **param_attr.attr)
+        ])
     return LayerOutput(
         name,
-        LayerType.NORMALIZE_LAYER,
+        LayerType.NORM_LAYER,
         parents=input,
         num_filters=input.num_filters,
         size=input.size)

From c06d8d2129f8bfaaefbc783ca2ae019395bbcdb3 Mon Sep 17 00:00:00 2001
From: gaoyuan <yuan.gao@noplz.name>
Date: Mon, 20 Mar 2017 20:38:50 +0800
Subject: [PATCH 03/20] Assert cross_channel_norm's input filters

---
 python/paddle/trainer_config_helpers/layers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index ed93fb72e0..cac71539f5 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1023,6 +1023,7 @@ def cross_channel_norm_layer(input, name=None, param_attr=None):
     :type param_attr: ParameterAttribute
     :return: LayerOutput
     """
+    assert input.num_filters is not None
     Layer(
         name=name,
         type=LayerType.NORM_LAYER,

From 549cf641cd9d063b0fa484d527e0a798e4d008d7 Mon Sep 17 00:00:00 2001
From: gaoyuan <yuan.gao@noplz.name>
Date: Mon, 20 Mar 2017 21:05:34 +0800
Subject: [PATCH 04/20] format the code

---
 paddle/gserver/tests/test_LayerGrad.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 781058ae3d..875fd39b40 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1602,7 +1602,6 @@ TEST(Layer, PadLayer) {
   }
 }
 
-
 TEST(Layer, CrossChannelNormLayer) {
   TestConfig config;
   config.layerConfig.set_type("norm");

From 4761300b2a13b9d33f1e742bfac23ac0b5ea2a9b Mon Sep 17 00:00:00 2001
From: "yi.wu" <yi.wu@baifendian.com>
Date: Wed, 22 Mar 2017 15:05:17 +0800
Subject: [PATCH 05/20] update docker build and install doc

---
 .../build_and_install/docker_install_cn.rst   | 130 +++++++++-------
 .../build_and_install/docker_install_en.rst   | 145 ++++++++++--------
 2 files changed, 153 insertions(+), 122 deletions(-)

diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index af889ec9d1..4150b18b2e 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -4,119 +4,131 @@ PaddlePaddle的Docker容器使用方式
 PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统（包括Linux，Mac OS X和Windows）上运行。 请注意，您需要更改 `Dockers设置 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 才能充分利用Mac OS X和Windows上的硬件资源。
 
 
-纯CPU和GPU的docker镜像使用说明
+PaddlePaddle发布的docker镜像使用说明
 ------------------------------
 
-对于每一个PaddlePaddle版本，我们都会发布两个Docker镜像：纯CPU的和GPU的。
-我们通过设置 `dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_ 自动生成最新的docker镜像：
-`paddledev/paddle:0.10.0rc1-cpu` 和 `paddledev/paddle:0.10.0rc1-gpu`。
+对于每一个PaddlePaddle版本，我们都会发布两种Docker镜像：开发镜像、运行镜像。运行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。
+我们通过设置 `dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_ 自动生成最新的docker镜像，可以在"tags"标签下找到最新的Paddle镜像版本。
+1. 开发镜像(纯CPU版本)：:code:`paddlepaddle/paddle:<version>-dev`
+    这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境，完成开发，编译，发布，
+    文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具，所以如果需要自行配置开发环境需要考虑版本的因素。
+    开发镜像包含了以下工具：
+    - gcc/clang
+    - nvcc
+    - Python
+    - sphinx
+    - woboq
+    - sshd
+    很多开发者会使用远程的安装有GPU的服务器工作，用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec` 进入开发镜像并开始工作，
+    也可以在开发镜像中启动一个SSHD服务，方便开发者直接登录到镜像中进行开发:
 
-以交互容器方式运行纯CPU的镜像：
+    以交互容器方式运行开发镜像：
 
-.. code-block:: bash
+    .. code-block:: bash
 
-    docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash
+        docker run -it --rm paddledev/paddle:<version>-dev /bin/bash
 
-或者，可以以后台进程方式运行容器：
+    或者，可以以后台进程方式运行容器：
 
-.. code-block:: bash
+    .. code-block:: bash
 
-    docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu
+        docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:<version>-dev
 
-然后用密码 :code:`root` SSH进入容器：
+    然后用密码 :code:`root` SSH进入容器：
 
-.. code-block:: bash
+    .. code-block:: bash
 
-    ssh -p 2202 root@localhost
+        ssh -p 2202 root@localhost
 
-SSH方式的一个优点是我们可以从多个终端进入容器。比如，一个终端运行vi，另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上，并在笔记本上通过SSH与其连接。
+    SSH方式的一个优点是我们可以从多个终端进入容器。比如，一个终端运行vi，另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上，并在笔记本上通过SSH与其连接。
 
+2. 运行镜像：根据CPU、GPU和非AVX区分了如下4个镜像：
+    - GPU/AVX：:code:`paddlepaddle/paddle:<version>-gpu`
+    - GPU/no-AVX：:code:`paddlepaddle/paddle:<version>-gpu-noavx`
+    - CPU/AVX：:code:`paddlepaddle/paddle:<version>`
+    - CPU/no-AVX：:code:`paddlepaddle/paddle:<version>-noavx`
 
-以上方法在GPU镜像里也能用－只是请不要忘记按装CUDA驱动，以及告诉Docker：
+    纯CPU镜像以及GPU镜像都会用到AVX指令集，但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX：
 
-.. code-block:: bash
+    .. code-block:: bash
 
-    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-    export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu
+       if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
 
+    如果输出是No，就需要选择使用no-AVX的镜像
 
-运行PaddlePaddle书籍
----------------------
+    注意：在运行GPU版本的镜像时，请不要忘记安装CUDA驱动，并告知Docker：
 
-Jupyter Notebook是一个开源的web程序，大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。
+    .. code-block:: bash
 
-PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nodebook。
-如果您想要更深入了解deep learning，PaddlePaddle书籍一定是您最好的选择。
+        export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+        export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+        docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:<version>-gpu
 
-当您进入容器内之后，只用运行以下命令：
+3. 使用运行镜像发布你的AI程序
+    假设您已经完成了一个AI训练的python程序 :code:`a.py`，这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行：
 
-.. code-block:: bash
-        
-    jupyter notebook
+    .. code-block:: bash
 
-然后在浏览器中输入以下网址：
-    
-.. code-block:: text
+        docker run -it -v $PWD:/work paddle /work/a.py
 
-    http://localhost:8888/
+    这里假设 :code:`a.py` 的所有依赖都已包含在Paddle的运行镜像中。如果需要包含更多的依赖，或者需要发布您的应用的镜像，可以编写 :code:`Dockerfile` 并使用 :code:`FROM paddledev/paddle:<version>`
+    创建和发布自己的AI程序镜像。
 
-就这么简单，享受您的旅程！
+运行PaddlePaddle书籍
+---------------------
 
+Jupyter Notebook是一个开源的web程序，大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。
 
-非AVX镜像
----------
+PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Notebook。
+如果您想要更深入了解deep learning，PaddlePaddle书籍一定是您最好的选择。
 
-纯CPU镜像以及GPU镜像都会用到AVX指令集，但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX：
+我们提供可以直接运行PaddlePaddle书籍的docker镜像，直接运行：
 
 .. code-block:: bash
 
-   if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+    docker run -p 8888:8888 paddlepaddle/book:<version>
 
-如果输出是No，我们就需要手动编译一个非AVX版本的镜像：
+然后在浏览器中输入以下网址：
 
-.. code-block:: bash
+.. code-block:: text
 
-   cd ~
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   cd Paddle
-   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
-   docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
+    http://localhost:8888/
 
+就这么简单，享受您的旅程！
 
 通过Docker容器开发PaddlePaddle
 ------------------------------
 
-开发人员可以在Docker中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux，Mac OS X和Windows。
+开发人员可以在Docker开发镜像中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux，Mac OS X和Windows。
+
+1. 构建开发镜像
 
-1. 将开发环境构建为Docker镜像
-   
    .. code-block:: bash
 
       git clone --recursive https://github.com/PaddlePaddle/Paddle
       cd Paddle
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
+      docker build -t paddle:dev .
 
 
-   请注意，默认情况下，:code:`docker build` 不会将源码导入到镜像中并编译它。如果我们想这样做，需要设置一个参数：
+   请注意，默认情况下，:code:`docker build` 不会将源码导入到镜像中并编译它。如果我们想这样做，需要执行：
 
    .. code-block:: bash
 
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON .
+      docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "TEST=OFF" paddle:dev
 
 
 2. 运行开发环境
 
    当我们编译好了 :code:`paddle:dev`， 我们可以在docker容器里做开发，源代码可以通过挂载本地文件来被载入Docker的开发环境里面：
-   
+
    .. code-block:: bash
 
-      docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev
+      docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev sshd
 
    以上代码会启动一个带有PaddlePaddle开发环境的docker容器，源代码会被挂载到 :code:`/paddle` 。
 
-   请注意， :code:`paddle:dev` 的默认入口是 :code:`sshd` 。以上的 :code:`docker run` 命令其实会启动一个在2202端口监听的SSHD服务器。这样，我们就能SSH进入我们的开发容器了：
-   
+   以上的 :code:`docker run` 命令其实会启动一个在2202端口监听的SSHD服务器。这样，我们就能SSH进入我们的开发容器了：
+
    .. code-block:: bash
 
       ssh root@localhost -p 2202
@@ -124,13 +136,13 @@ PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nod
 3. 在Docker开发环境中编译与安装PaddlPaddle代码
 
    当在容器里面的时候，可以用脚本 :code:`paddle/scripts/docker/build.sh` 来编译、安装与测试PaddlePaddle：
-   
+
    .. code-block:: bash
-		      
+
       /paddle/paddle/scripts/docker/build.sh
 
    以上指令会在 :code:`/paddle/build` 中编译PaddlePaddle。通过以下指令可以运行单元测试：
-   
+
    .. code-block:: bash
 
       cd /paddle/build
@@ -140,14 +152,14 @@ PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nod
 文档
 ----
 
-Paddle的Docker镜像带有一个通过 `woboq code browser
+Paddle的Docker开发镜像带有一个通过 `woboq code browser
 <https://github.com/woboq/woboq_codebrowser>`_ 生成的HTML版本的C++源代码，便于用户浏览C++源码。
 
 只要在Docker里启动PaddlePaddle的时候给它一个名字，就可以再运行另一个Nginx Docker镜像来服务HTML代码：
 
 .. code-block:: bash
 
-   docker run -d --name paddle-cpu-doc paddle:0.10.0rc1-cpu
+   docker run -d --name paddle-cpu-doc paddle:<version>-dev
    docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
 
 接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 606746597a..80782b61d5 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -12,44 +12,91 @@ of your hardware resource on Mac OS X and Windows.
 Usage of CPU-only and GPU Images
 ----------------------------------
 
-For each version of PaddlePaddle, we release 2 Docker images, a
-CPU-only one and a CUDA GPU one.  We do so by configuring
+For each version of PaddlePaddle, we release 2 types of Docker images: development
+image and production image. Production image includes CPU-only version and a CUDA
+GPU version and their no-AVX versions. We do so by configuring
 `dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_
-automatically generate the latest docker images `paddledev/paddle:0.10.0rc1-cpu`
-and `paddledev/paddle:0.10.0rc1-gpu`.
+to automatically generate the latest docker images. You can find the latest versions
+under "tags" tab at dockerhub.com.
+1. development image(support pure cpu) :code:`paddlepaddle/paddle:<version>-dev`
+    This image packs the related development tools and runtime environment. Users and
+    developers can use this image instead of their own local computer to accomplish
+    development, building, releasing, document writing, etc. Since different versions of
+    Paddle may depend on different versions of libraries and tools, if you want to
+    set up a local environment, you must pay attention to the versions.
+    The development image contains:
+    - gcc/clang
+    - nvcc
+    - Python
+    - sphinx
+    - woboq
+    - sshd
+    Many developers work on remote servers with GPUs. They can use ssh to log in to the server
+    and run :code:`docker exec` to enter the docker container and start their work.
+    They can also start a development docker image with the SSHD service, so that they can log in to
+    the container directly and start working.
 
-To run the CPU-only image as an interactive container:
+    To run the development image as an interactive container:
 
-.. code-block:: bash
+    .. code-block:: bash
 
-    docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash
+        docker run -it --rm paddledev/paddle:<version>-dev /bin/bash
 
-or, we can run it as a daemon container
+    or, we can run it as a daemon container
 
-.. code-block:: bash
+    .. code-block:: bash
 
-    docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu
+        docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:<version>-dev
 
-and SSH to this container using password :code:`root`:
+    and SSH to this container using password :code:`root`:
 
-.. code-block:: bash
+    .. code-block:: bash
 
-    ssh -p 2202 root@localhost
+        ssh -p 2202 root@localhost
 
-An advantage of using SSH is that we can connect to PaddlePaddle from
-more than one terminals.  For example, one terminal running vi and
-another one running Python interpreter.  Another advantage is that we
-can run the PaddlePaddle container on a remote server and SSH to it
-from a laptop.
+    An advantage of using SSH is that we can connect to PaddlePaddle from
+    more than one terminals.  For example, one terminal running vi and
+    another one running Python interpreter.  Another advantage is that we
+    can run the PaddlePaddle container on a remote server and SSH to it
+    from a laptop.
 
-Above methods work with the GPU image too -- just please don't forget
-to install CUDA driver and let Docker knows about it:
 
-.. code-block:: bash
+2. Production images, this image might have multiple variants:
+    - GPU/AVX: :code:`paddlepaddle/paddle:<version>-gpu`
+    - GPU/no-AVX: :code:`paddlepaddle/paddle:<version>-gpu-noavx`
+    - CPU/AVX: :code:`paddlepaddle/paddle:<version>`
+    - CPU/no-AVX: :code:`paddlepaddle/paddle:<version>-noavx`
+
+    Please be aware that the CPU-only and the GPU images both use the AVX
+    instruction set, but old computers produced before 2008 do not support
+    AVX.  The following command checks if your Linux computer supports
+    AVX:
+
+    .. code-block:: bash
+
+       if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+
+
+    If it doesn't, we will use the non-AVX images.
+
+    Note: when running the GPU images, please don't forget
+    to install the CUDA driver and let Docker know about it:
+
+    .. code-block:: bash
+
+        export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+        export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+        docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:<version>-gpu
+
+
+3. Use production image to release your AI application
+    Suppose that we have a simple application program in :code:`a.py`, we can test and run it using the production image:
+
+    .. code-block:: bash
+
+        docker run -it -v $PWD:/work paddle /work/a.py
 
-    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-    export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu
+    But this works only if all dependencies of :code:`a.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image with more dependencies installed.
 
 
 PaddlePaddle Book
@@ -59,50 +106,24 @@ The Jupyter Notebook is an open-source web application that allows
 you to create and share documents that contain live code, equations,
 visualizations and explanatory text in a single browser.
 
-PaddlePaddle Book is an interactive Jupyter Notebook for users and developers. 
+PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
 We already exposed port 8888 for this book. If you want to
 dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
 
-Once you are inside the container, simply issue the command:
+We provide a packaged book image, simply issue the command:
 
 .. code-block:: bash
-        
-    jupyter notebook
+
+    docker run -p 8888:8888 paddlepaddle/book:<version>
 
 Then, you would back and paste the address into the local browser:
-    
+
 .. code-block:: text
 
     http://localhost:8888/
 
 That's all. Enjoy your journey!
 
-
-Non-AVX Images
---------------
-
-Please be aware that the CPU-only and the GPU images both use the AVX
-instruction set, but old computers produced before 2008 do not support
-AVX.  The following command checks if your Linux computer supports
-AVX:
-
-.. code-block:: bash
-
-   if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-
-
-If it doesn't, we will need to build non-AVX images manually from
-source code:
-
-.. code-block:: bash
-
-   cd ~
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   cd Paddle
-   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
-   docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
-
-
 Development Using Docker
 ------------------------
 
@@ -110,14 +131,13 @@ Developers can work on PaddlePaddle using Docker.  This allows
 developers to work on different platforms -- Linux, Mac OS X, and
 Windows -- in a consistent way.
 
-1. Build the Development Environment as a Docker Image
+1. Build the Development Docker Image
 
    .. code-block:: bash
 
       git clone --recursive https://github.com/PaddlePaddle/Paddle
       cd Paddle
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
-
+      docker build -t paddle:dev .
 
    Note that by default :code:`docker build` wouldn't import source
    tree into the image and build it.  If we want to do that, we need
@@ -125,7 +145,7 @@ Windows -- in a consistent way.
 
    .. code-block:: bash
 
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON .
+      docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "TEST=OFF" paddle:dev
 
 
 2. Run the Development Environment
@@ -136,14 +156,13 @@ Windows -- in a consistent way.
 
    .. code-block:: bash
 
-      docker run -d -p 2202:22 -p 8888:8888 -v $PWD:/paddle paddle:dev
+      docker run -d -p 2202:22 -p 8888:8888 -v $PWD:/paddle paddle:dev sshd
 
    This runs a container of the development environment Docker image
    with the local source tree mounted to :code:`/paddle` of the
    container.
 
-   Note that the default entry-point of :code:`paddle:dev` is
-   :code:`sshd`, and above :code:`docker run` commands actually starts
+   The above :code:`docker run` command actually starts
    an SSHD server listening on port 2202.  This allows us to log into
    this container with:
 
@@ -191,7 +210,7 @@ container:
 
 .. code-block:: bash
 
-   docker run -d --name paddle-cpu-doc paddle:0.10.0rc1-cpu
+   docker run -d --name paddle-cpu-doc paddle:<version>-dev
    docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
 
 

From b7a809a110832568694687d01609753302114570 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Wed, 22 Mar 2017 16:45:39 +0800
Subject: [PATCH 06/20] add download api for dataset

---
 python/paddle/v2/dataset/cifar.py       |  5 +++++
 python/paddle/v2/dataset/conll05.py     |  8 ++++++++
 python/paddle/v2/dataset/imdb.py        |  4 ++++
 python/paddle/v2/dataset/imikolov.py    |  4 ++++
 python/paddle/v2/dataset/mnist.py       |  5 +++++
 python/paddle/v2/dataset/movielens.py   | 12 ++++++++----
 python/paddle/v2/dataset/sentiment.py   |  4 ++++
 python/paddle/v2/dataset/uci_housing.py |  4 ++++
 python/paddle/v2/dataset/wmt14.py       |  4 ++++
 9 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
index d9f7a830ee..f8b97f7c1f 100644
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -75,3 +75,8 @@ def test10():
     return reader_creator(
         paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
         'test_batch')
+
+
+def download():
+    paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
+    paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
index 9eab49ee39..d39f81e412 100644
--- a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/v2/dataset/conll05.py
@@ -196,3 +196,11 @@ def test():
         words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
         props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
     return reader_creator(reader, word_dict, verb_dict, label_dict)
+
+
+def download():
+    paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
+    paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
+    paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
+    paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
+    paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 76019d9f54..c4c6d738ed 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -123,3 +123,7 @@ def test(word_idx):
 def word_dict():
     return build_dict(
         re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
+
+
+def download():
+    paddle.v2.dataset.common.download(URL, 'imdb', MD5)
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
index 97c160f111..db83361d2f 100644
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -89,3 +89,7 @@ def train(word_idx, n):
 
 def test(word_idx, n):
     return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n)
+
+
+def download():
+    paddle.v2.dataset.common.download(URL, "imikolov", MD5)
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
index 16f2fcb99d..7b536bfa03 100644
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@@ -106,3 +106,8 @@ def test():
                                           TEST_IMAGE_MD5),
         paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist',
                                           TEST_LABEL_MD5), 100)
+
+
+def download():
+    paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
+    paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
index 25fd8227da..ced1b4c273 100644
--- a/python/paddle/v2/dataset/movielens.py
+++ b/python/paddle/v2/dataset/movielens.py
@@ -30,6 +30,9 @@ __all__ = [
 
 age_table = [1, 18, 25, 35, 45, 50, 56]
 
+URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
+MD5 = 'c4d9eecfca2ab87c1945afe126590906'
+
 
 class MovieInfo(object):
     def __init__(self, index, categories, title):
@@ -77,10 +80,7 @@ USER_INFO = None
 
 
 def __initialize_meta_info__():
-    fn = download(
-        url='http://files.grouplens.org/datasets/movielens/ml-1m.zip',
-        module_name='movielens',
-        md5sum='c4d9eecfca2ab87c1945afe126590906')
+    fn = paddle.v2.dataset.common.download(URL, "movielens", MD5)
     global MOVIE_INFO
     if MOVIE_INFO is None:
         pattern = re.compile(r'^(.*)\((\d+)\)$')
@@ -205,5 +205,9 @@ def unittest():
     print train_count, test_count
 
 
+def download():
+    paddle.v2.dataset.common.download(URL, "movielens", MD5)
+
+
 if __name__ == '__main__':
     unittest()
diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py
index 71689fd61b..b1d517fc6a 100644
--- a/python/paddle/v2/dataset/sentiment.py
+++ b/python/paddle/v2/dataset/sentiment.py
@@ -125,3 +125,7 @@ def test():
     """
     data_set = load_sentiment_data()
     return reader_creator(data_set[NUM_TRAINING_INSTANCES:])
+
+
+def download():
+    nltk.download('movie_reviews', download_dir=common.DATA_HOME)
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
index 27f454b137..52bfc9e338 100644
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -89,3 +89,7 @@ def test():
             yield d[:-1], d[-1:]
 
     return reader
+
+
+def download():
+    paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index c686870a49..a5b4ea9862 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -103,3 +103,7 @@ def test(dict_size):
     return reader_creator(
         paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
         'test/test', dict_size)
+
+
+def download():
+    paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)

From 7ca0c6ccff582e15998646df075ba220160e91e5 Mon Sep 17 00:00:00 2001
From: "yi.wu" <yi.wu@baifendian.com>
Date: Wed, 22 Mar 2017 17:10:43 +0800
Subject: [PATCH 07/20] separate build and run tests

---
 paddle/scripts/docker/README.md | 5 +++--
 paddle/scripts/docker/build.sh  | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md
index e5af5c9a1e..44e570cfde 100644
--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
@@ -94,7 +94,7 @@ docker build -t paddle:dev --build-arg UBUNTU_MIRROR=mirror://mirrors.ubuntu.com
 Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host):
 
 ```bash
-docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "TEST=OFF" paddle:dev
+docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=OFF" -e "RUN_TEST=OFF" paddle:dev
 ```
 
 This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, could build the source code with possible local changes.  When it writes to `/paddle/build` in the container, it writes to `$PWD/build` on the host indeed.
@@ -108,7 +108,8 @@ This command mounts the source directory on the host into `/paddle` in the conta
 Users can specify the following Docker build arguments with either "ON" or "OFF" value:
 - `WITH_GPU`: ***Required***. Generates NVIDIA CUDA GPU code and relies on CUDA libraries.
 - `WITH_AVX`: ***Required***. Set to "OFF" prevents from generating AVX instructions. If you don't know what is AVX, you might want to set "ON".
-- `TEST`: ***Optional, default OFF***. Build unit tests and run them after building.
+- `WITH_TEST`: ***Optional, default OFF***. Build unit test binaries.
+- `RUN_TEST`: ***Optional, default OFF***. Run unit tests after building. Unit tests can't be run unless they were built, so this requires `WITH_TEST=ON`.
 
 ### Build the Production Docker Image
 
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 405d3338af..a0da561dfe 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -33,10 +33,10 @@ cmake .. \
       -DWITH_SWIG_PY=ON \
       -DCUDNN_ROOT=/usr/ \
       -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \
-      -DON_COVERALLS=${TEST:-OFF} \
+      -DON_COVERALLS=${WITH_TEST:-OFF} \
       -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 make -j `nproc`
-if [[ ${TEST:-OFF} == "ON" ]]; then
+if [[ ${RUN_TEST:-OFF} == "ON" ]]; then
     make coveralls
 fi
 make install

From 57c355a117534fab2a4bebc6aa30a7ffa83feead Mon Sep 17 00:00:00 2001
From: gaoyuan <yuan.gao@noplz.name>
Date: Wed, 22 Mar 2017 17:41:43 +0800
Subject: [PATCH 08/20] Remove redundancy codes

---
 paddle/gserver/layers/CrossChannelNormLayer.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp
index ced7199991..dd3612c49d 100644
--- a/paddle/gserver/layers/CrossChannelNormLayer.cpp
+++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp
@@ -21,7 +21,6 @@ namespace paddle {
 
 void CrossChannelNormLayer::forward(PassType passType) {
   Layer::forward(passType);
-  auto in = getInput(0);
   MatrixPtr inV = getInputValue(0);
 
   size_t batchSize = inV->getHeight();
@@ -36,7 +35,6 @@ void CrossChannelNormLayer::forward(PassType passType) {
   Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
   Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
   normBuffer_->zeroMem();
-  spatialBuffer_->zeroMem();
   dataBuffer_->zeroMem();
   // add eps to avoid overflow
   normBuffer_->addScalar(*normBuffer_, 1e-6);
@@ -71,7 +69,6 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
   MatrixPtr outG = getOutputGrad();
   MatrixPtr outV = getOutputValue();
 
-  auto in = getInput(0);
   size_t batchSize = inG->getHeight();
   size_t dataDim = inG->getWidth();
   size_t spatialDim = dataDim / channels_;

From 784e242bd5d4b323031ba0618e82a734194edf72 Mon Sep 17 00:00:00 2001
From: gaoyuan <yuan.gao@noplz.name>
Date: Wed, 22 Mar 2017 19:26:59 +0800
Subject: [PATCH 09/20] Remove redundancy codes

---
 doc/api/v2/config/layer.rst                     | 6 ++++++
 paddle/gserver/layers/CrossChannelNormLayer.cpp | 2 --
 paddle/gserver/layers/NormLayer.h               | 9 ++++-----
 python/paddle/trainer_config_helpers/layers.py  | 1 +
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index db33a20487..8a8774bd69 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -109,6 +109,12 @@ sum_to_one_norm
     :members: sum_to_one_norm
     :noindex:
     
+cross_channel_norm
+---------------
+..  automodule:: paddle.v2.layer
+    :members: cross_channel_norm
+    :noindex:
+    
 Recurrent Layers
 ================
 
diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp
index dd3612c49d..0bc90966e2 100644
--- a/paddle/gserver/layers/CrossChannelNormLayer.cpp
+++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp
@@ -40,7 +40,6 @@ void CrossChannelNormLayer::forward(PassType passType) {
   normBuffer_->addScalar(*normBuffer_, 1e-6);
   inV->square2(*dataBuffer_);
   for (size_t i = 0; i < batchSize; i++) {
-    spatialBuffer_->zeroMem();
     MatrixPtr inTmp = Matrix::create(
         inV->getData() + i * dataDim, channels_, spatialDim, false, useGpu_);
     MatrixPtr dataTmp = Matrix::create(dataBuffer_->getData() + i * dataDim,
@@ -80,7 +79,6 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
   scaleDiff_->zeroMem();
   for (size_t i = 0; i < batchSize; i++) {
     spatialBuffer_->zeroMem();
-    channelBuffer_->zeroMem();
     // propagate to param.
     MatrixPtr dataBufferTmp =
         Matrix::create(dataBuffer_->getData() + i * dataDim,
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
index 59ba226dfe..d896abbd75 100644
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -66,11 +66,10 @@ public:
 };
 
 /**
- * This layer applys normalize across the channels of each sample to a
- * conv layer's output and scale the output by a group of trainable factors
- * which dimensions equal to the channel's number.
- * - Input: One and only one input layer are accepted. The input layer must be
- *        be a data output layer.
+ * This layer applys normalization across the channels of each sample to a
+ * conv layer's output, and scales the output by a group of trainable factors
+ * whose equal to the number of channels.
+ * - Input: One and only one input layer are accepted.
  * - Output: The normalized data of the input data.
  * Reference:
  *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index cac71539f5..df91a6d8cf 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1015,6 +1015,7 @@ def cross_channel_norm_layer(input, name=None, param_attr=None):
     This layer applys normalize across the channels of each sample to
     a conv layer's output and scale the output by a group of trainable
     factors which dimensions equal to the channel's number.
+
     :param name: The Layer Name.
     :type name: basestring
     :param input: The input layer.

From 17c697c7541390179aaf3aa02391b8e453acb284 Mon Sep 17 00:00:00 2001
From: gaoyuan <yuan.gao@noplz.name>
Date: Wed, 22 Mar 2017 19:52:37 +0800
Subject: [PATCH 10/20] Remove redundancy codes

---
 doc/api/v2/config/layer.rst                     | 2 +-
 paddle/gserver/layers/CrossChannelNormLayer.cpp | 1 -
 paddle/gserver/layers/NormLayer.h               | 2 +-
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 8a8774bd69..05817ec854 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -110,7 +110,7 @@ sum_to_one_norm
     :noindex:
     
 cross_channel_norm
----------------
+------------------
 ..  automodule:: paddle.v2.layer
     :members: cross_channel_norm
     :noindex:
diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp
index 0bc90966e2..0c8156ae77 100644
--- a/paddle/gserver/layers/CrossChannelNormLayer.cpp
+++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp
@@ -78,7 +78,6 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
   Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
   scaleDiff_->zeroMem();
   for (size_t i = 0; i < batchSize; i++) {
-    spatialBuffer_->zeroMem();
     // propagate to param.
     MatrixPtr dataBufferTmp =
         Matrix::create(dataBuffer_->getData() + i * dataDim,
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
index d896abbd75..f490f506a9 100644
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -68,7 +68,7 @@ public:
 /**
  * This layer applys normalization across the channels of each sample to a
  * conv layer's output, and scales the output by a group of trainable factors
- * whose equal to the number of channels.
+ * whose dimensions equal to the number of channels.
  * - Input: One and only one input layer are accepted.
  * - Output: The normalized data of the input data.
  * Reference:

From 3bce32bae2625cdfb08300bf6d3b4cc65e0da2c1 Mon Sep 17 00:00:00 2001
From: gaoyuan <yuan.gao@noplz.name>
Date: Thu, 23 Mar 2017 11:17:27 +0800
Subject: [PATCH 11/20] Add create matrix pointer function

---
 .../gserver/layers/CrossChannelNormLayer.cpp  | 78 ++++++++++---------
 paddle/gserver/layers/NormLayer.h             |  3 +-
 2 files changed, 42 insertions(+), 39 deletions(-)

diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp
index 0c8156ae77..4c95274293 100644
--- a/paddle/gserver/layers/CrossChannelNormLayer.cpp
+++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp
@@ -19,6 +19,23 @@ limitations under the License. */
 
 namespace paddle {
 
+MatrixPtr CrossChannelNormLayer::createSampleMatrix(MatrixPtr data,
+                                                    size_t iter,
+                                                    size_t spatialDim) {
+  return Matrix::create(data->getData() + iter * channels_ * spatialDim,
+                        channels_,
+                        spatialDim,
+                        false,
+                        useGpu_);
+}
+
+MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data,
+                                                     size_t iter,
+                                                     size_t spatialDim) {
+  return Matrix::create(
+      data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_);
+}
+
 void CrossChannelNormLayer::forward(PassType passType) {
   Layer::forward(passType);
   MatrixPtr inV = getInputValue(0);
@@ -40,25 +57,19 @@ void CrossChannelNormLayer::forward(PassType passType) {
   normBuffer_->addScalar(*normBuffer_, 1e-6);
   inV->square2(*dataBuffer_);
   for (size_t i = 0; i < batchSize; i++) {
-    MatrixPtr inTmp = Matrix::create(
-        inV->getData() + i * dataDim, channels_, spatialDim, false, useGpu_);
-    MatrixPtr dataTmp = Matrix::create(dataBuffer_->getData() + i * dataDim,
-                                       channels_,
-                                       spatialDim,
-                                       false,
-                                       useGpu_);
-    MatrixPtr outTmp = Matrix::create(
-        outV->getData() + i * dataDim, channels_, spatialDim, false, useGpu_);
-    MatrixPtr normTmp = Matrix::create(
-        normBuffer_->getData() + i * spatialDim, 1, spatialDim, false, useGpu_);
+    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
+    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
+    MatrixPtr outVTmp = createSampleMatrix(outV, i, spatialDim);
+    MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
+
     // compute norm.
-    spatialBuffer_->sumCols(*dataTmp, 1, 1);
+    spatialBuffer_->sumCols(*dataTmp, 1, 0);
     spatialBuffer_->sqrt2(*spatialBuffer_);
     normTmp->copyFrom(*spatialBuffer_);
-    outTmp->copyFrom(*inTmp);
-    outTmp->divRowVector(*spatialBuffer_);
+    outVTmp->copyFrom(*inVTmp);
+    outVTmp->divRowVector(*spatialBuffer_);
     // scale the layer.
-    outTmp->mulColVector(*scale_->getW());
+    outVTmp->mulColVector(*scale_->getW());
   }
 }
 
@@ -78,40 +89,31 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
   Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
   scaleDiff_->zeroMem();
   for (size_t i = 0; i < batchSize; i++) {
-    // propagate to param.
-    MatrixPtr dataBufferTmp =
-        Matrix::create(dataBuffer_->getData() + i * dataDim,
-                       channels_,
-                       spatialDim,
-                       false,
-                       useGpu_);
-    const MatrixPtr inValueTmp = Matrix::create(
-        inV->getData() + i * dataDim, channels_, spatialDim, false, useGpu_);
-    const MatrixPtr outGradTmp = Matrix::create(
-        outG->getData() + i * dataDim, channels_, spatialDim, false, useGpu_);
-    MatrixPtr inGradTmp = Matrix::create(
-        inG->getData() + i * dataDim, channels_, spatialDim, false, useGpu_);
-    const MatrixPtr normTmp = Matrix::create(
-        normBuffer_->getData() + i * spatialDim, 1, spatialDim, false, useGpu_);
-    channelBuffer_->sumRows(*dataBufferTmp, 1, 1);
+    MatrixPtr outGTmp = createSampleMatrix(outG, i, spatialDim);
+    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
+    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
+    const MatrixPtr inGTmp = createSampleMatrix(inG, i, spatialDim);
+    const MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
+
+    channelBuffer_->sumRows(*dataTmp, 1, 0);
     channelBuffer_->dotDiv(*channelBuffer_, *(scale_->getW()));
     // store a / scale[i] in scaleDiff_ temporary
     scaleDiff_->add(*channelBuffer_, 1.);
 
-    sampleBuffer_->dotMul(*inValueTmp, *outGradTmp);
+    sampleBuffer_->dotMul(*inVTmp, *outGTmp);
     spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.);
     // scale the grad
-    inGradTmp->copyFrom(*inValueTmp);
-    inGradTmp->mulRowVector(*spatialBuffer_);
+    inGTmp->copyFrom(*inVTmp);
+    inGTmp->mulRowVector(*spatialBuffer_);
     // divide by square of norm
     spatialBuffer_->dotMul(*normTmp, *normTmp);
-    inGradTmp->divRowVector(*spatialBuffer_);
+    inGTmp->divRowVector(*spatialBuffer_);
     // subtract
-    inGradTmp->add(*outGradTmp, -1, 1);
+    inGTmp->add(*outGTmp, -1, 1);
     // divide by norm
-    inGradTmp->divRowVector(*normTmp);
+    inGTmp->divRowVector(*normTmp);
     // scale the diff
-    inGradTmp->mulColVector(*scale_->getW());
+    inGTmp->mulColVector(*scale_->getW());
   }
   // updata scale
   if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_);
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
index f490f506a9..7c238ac944 100644
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -80,9 +80,10 @@ public:
   explicit CrossChannelNormLayer(const LayerConfig& config)
       : NormLayer(config) {}
   bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
   void forward(PassType passType);
   void backward(const UpdateCallback& callback);
+  MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
+  MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
 
 protected:
   size_t channels_;

From b8c33646c7c6c9c56cc38312a837c7533047d2a1 Mon Sep 17 00:00:00 2001
From: "yi.wu" <yi.wu@baifendian.com>
Date: Thu, 23 Mar 2017 11:49:59 +0800
Subject: [PATCH 12/20] update

---
 .../build_and_install/docker_install_cn.rst      | 11 ++++++-----
 .../build_and_install/docker_install_en.rst      | 16 ++++++++--------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 4150b18b2e..eb6e400828 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -8,8 +8,9 @@ PaddlePaddle发布的docker镜像使用说明
 ------------------------------
 
 对于每一个PaddlePaddle版本，我们都会发布两种Docker镜像：开发镜像、运行镜像。运行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。
-我们通过设置 `dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_ 自动生成最新的docker镜像，可以在"tags"标签下找到最新的Paddle镜像版本。
-1. 开发镜像(纯CPU版本)：:code:`paddlepaddle/paddle:<version>-dev`
+我们会在 `dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_ 提供最新的docker镜像，可以在"tags"标签下找到最新的Paddle镜像版本。
+1. 开发镜像：:code:`paddlepaddle/paddle:<version>-dev`
+
     这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境，完成开发，编译，发布，
     文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具，所以如果需要自行配置开发环境需要考虑版本的因素。
     开发镜像包含了以下工具：
@@ -62,7 +63,7 @@ PaddlePaddle发布的docker镜像使用说明
 
         export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
         export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-        docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu
+        docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:<version>-gpu
 
 3. 使用运行镜像发布你的AI程序
     假设您已经完成了一个AI训练的python程序 :code:`a.py`，这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行：
@@ -86,7 +87,7 @@ PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nod
 
 .. code-block:: bash
 
-    docker run -p 8888:8888 paddlepaddle/book:<version>
+    docker run -p 8888:8888 paddlepaddle/book
 
 然后在浏览器中输入以下网址：
 
@@ -110,7 +111,7 @@ PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nod
       docker build -t paddle:dev .
 
 
-   请注意，默认情况下，:code:`docker build` 不会将源码导入到镜像中并编译它。如果我们想这样做，需要执行：
+   请注意，默认情况下，:code:`docker build` 不会将源码导入到镜像中并编译它。如果我们想这样做，需要构建完开发镜像，然后执行：
 
    .. code-block:: bash
 
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 80782b61d5..0df8508ace 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -14,11 +14,11 @@ Usage of CPU-only and GPU Images
 
 For each version of PaddlePaddle, we release 2 types of Docker images: development
 image and production image. Production image includes CPU-only version and a CUDA
-GPU version and their no-AVX versions. We do so by configuring
-`dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_
-automatically generate the latest docker images. You can find the latest versions
-under "tags" tab at dockerhub.com.
-1. development image(support pure cpu) :code:`paddlepaddle/paddle:<version>-dev`
+GPU version and their no-AVX versions. We put the docker images on
+`dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_. You can find the
+latest versions under "tags" tab at dockerhub.com.
+1. development image :code:`paddlepaddle/paddle:<version>-dev`
+
     This image has packed related develop tools and runtime environment. Users and
     developers can use this image instead of their own local computer to accomplish
     development, build, releasing, document writing etc. While different version of
@@ -114,7 +114,7 @@ We provide a packaged book image, simply issue the command:
 
 .. code-block:: bash
 
-    docker run -p 8888:8888 paddlepaddle/book:<version>
+    docker run -p 8888:8888 paddlepaddle/book
 
 Then, you would back and paste the address into the local browser:
 
@@ -140,8 +140,8 @@ Windows -- in a consistent way.
       docker build -t paddle:dev .
 
    Note that by default :code:`docker build` wouldn't import source
-   tree into the image and build it.  If we want to do that, we need
-   to set a build arg:
+   tree into the image and build it.  If we want to do that, we need to build
+   the development docker image and then run the following command:
 
    .. code-block:: bash
 

From cc82c27544b95eaafcd23c24a6f5f59c783eb50a Mon Sep 17 00:00:00 2001
From: "yi.wu" <yi.wu@baifendian.com>
Date: Thu, 23 Mar 2017 11:56:04 +0800
Subject: [PATCH 13/20] update

---
 paddle/scripts/docker/README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md
index 44e570cfde..7c90316ad8 100644
--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
@@ -108,7 +108,10 @@ This command mounts the source directory on the host into `/paddle` in the conta
 Users can specify the following Docker build arguments with either "ON" or "OFF" value:
 - `WITH_GPU`: ***Required***. Generates NVIDIA CUDA GPU code and relies on CUDA libraries.
 - `WITH_AVX`: ***Required***. Set to "OFF" prevents from generating AVX instructions. If you don't know what is AVX, you might want to set "ON".
-- `WITH_TEST`: ***Optional, default OFF***. Build unit tests binaries.
+- `WITH_TEST`: ***Optional, default OFF***. Build unit tests binaries. Once you've built the unit tests, you can run these tests manually with the following command:
+  ```bash
+    docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" paddle:dev sh -c "cd /paddle/build; make coverall"
+  ```
 - `RUN_TEST`: ***Optional, default OFF***. Run unit tests after building. You can't run unit tests without building it.
 
 ### Build the Production Docker Image

From 21b7f4a60417a8182001576e73e6b8935ff8c3c2 Mon Sep 17 00:00:00 2001
From: gaoyuan <yuan.gao@noplz.name>
Date: Thu, 23 Mar 2017 12:04:53 +0800
Subject: [PATCH 14/20] Remove redundancy codes

---
 paddle/gserver/layers/CrossChannelNormLayer.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp
index 4c95274293..3fbccc1103 100644
--- a/paddle/gserver/layers/CrossChannelNormLayer.cpp
+++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp
@@ -52,7 +52,6 @@ void CrossChannelNormLayer::forward(PassType passType) {
   Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
   Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
   normBuffer_->zeroMem();
-  dataBuffer_->zeroMem();
   // add eps to avoid overflow
   normBuffer_->addScalar(*normBuffer_, 1e-6);
   inV->square2(*dataBuffer_);

From 7b72c7926484905be37d10aa0bcfdada01f48d4e Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Fri, 24 Mar 2017 09:52:26 +0800
Subject: [PATCH 15/20] update

---
 python/paddle/v2/dataset/cifar.py       | 2 +-
 python/paddle/v2/dataset/conll05.py     | 2 +-
 python/paddle/v2/dataset/imdb.py        | 2 +-
 python/paddle/v2/dataset/imikolov.py    | 2 +-
 python/paddle/v2/dataset/mnist.py       | 2 +-
 python/paddle/v2/dataset/movielens.py   | 2 +-
 python/paddle/v2/dataset/sentiment.py   | 4 ++--
 python/paddle/v2/dataset/uci_housing.py | 2 +-
 python/paddle/v2/dataset/wmt14.py       | 2 +-
 9 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
index f8b97f7c1f..c38caf7a89 100644
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -77,6 +77,6 @@ def test10():
         'test_batch')
 
 
-def download():
+def fetch_data():
     paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
     paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
index d39f81e412..dede47cbd3 100644
--- a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/v2/dataset/conll05.py
@@ -198,7 +198,7 @@ def test():
     return reader_creator(reader, word_dict, verb_dict, label_dict)
 
 
-def download():
+def fetch_data():
     paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
     paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
     paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index c4c6d738ed..05e67f39d1 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -125,5 +125,5 @@ def word_dict():
         re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
 
 
-def download():
+def fetch_data():
     paddle.v2.dataset.common.download(URL, 'imdb', MD5)
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
index db83361d2f..b89628cea5 100644
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -91,5 +91,5 @@ def test(word_idx, n):
     return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n)
 
 
-def download():
+def fetch_data():
     paddle.v2.dataset.common.download(URL, "imikolov", MD5)
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
index 7b536bfa03..8f1ce4df25 100644
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@@ -108,6 +108,6 @@ def test():
                                           TEST_LABEL_MD5), 100)
 
 
-def download():
+def fetch_data():
     paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
     paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
index ced1b4c273..cebf8a13ec 100644
--- a/python/paddle/v2/dataset/movielens.py
+++ b/python/paddle/v2/dataset/movielens.py
@@ -205,7 +205,7 @@ def unittest():
     print train_count, test_count
 
 
-def download():
+def fetch_data():
     paddle.v2.dataset.common.download(URL, "movielens", MD5)
 
 
diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py
index b1d517fc6a..3183288a7a 100644
--- a/python/paddle/v2/dataset/sentiment.py
+++ b/python/paddle/v2/dataset/sentiment.py
@@ -26,7 +26,7 @@ from itertools import chain
 import nltk
 from nltk.corpus import movie_reviews
 
-import common
+import paddle.v2.dataset.common
 
 __all__ = ['train', 'test', 'get_word_dict']
 NUM_TRAINING_INSTANCES = 1600
@@ -127,5 +127,5 @@ def test():
     return reader_creator(data_set[NUM_TRAINING_INSTANCES:])
 
 
-def download():
+def fetch_data():
     nltk.download('movie_reviews', download_dir=common.DATA_HOME)
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
index 52bfc9e338..5066c0886b 100644
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -91,5 +91,5 @@ def test():
     return reader
 
 
-def download():
+def fetch_data():
     paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index a5b4ea9862..759c77ccfd 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -105,5 +105,5 @@ def test(dict_size):
         'test/test', dict_size)
 
 
-def download():
+def fetch_data():
     paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)

From 14eb5b8e104d89bdb82be40e31acb5a0f7bc79e0 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Fri, 24 Mar 2017 10:51:12 +0800
Subject: [PATCH 16/20] rename fetch_all to fetch; add fetch_all function

---
 python/paddle/v2/dataset/cifar.py       | 21 ++++++++-------------
 python/paddle/v2/dataset/common.py      | 12 ++++++++++++
 python/paddle/v2/dataset/conll05.py     | 12 ++++++------
 python/paddle/v2/dataset/imdb.py        |  2 +-
 python/paddle/v2/dataset/imikolov.py    |  2 +-
 python/paddle/v2/dataset/mnist.py       |  4 +++-
 python/paddle/v2/dataset/movielens.py   |  4 ++--
 python/paddle/v2/dataset/sentiment.py   |  4 ++--
 python/paddle/v2/dataset/uci_housing.py |  4 ++--
 python/paddle/v2/dataset/wmt14.py       | 12 +++++-------
 10 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
index c38caf7a89..3a8b98b8f0 100644
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -20,7 +20,7 @@ TODO(yuyang18): Complete the comments.
 import cPickle
 import itertools
 import numpy
-import paddle.v2.dataset.common
+from common import download
 import tarfile
 
 __all__ = ['train100', 'test100', 'train10', 'test10']
@@ -55,28 +55,23 @@ def reader_creator(filename, sub_name):
 
 def train100():
     return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
-        'train')
+        download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'train')
 
 
 def test100():
-    return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
-        'test')
+    return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test')
 
 
 def train10():
     return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'data_batch')
+        download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch')
 
 
 def test10():
     return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'test_batch')
+        download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'test_batch')
 
 
-def fetch_data():
-    paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
-    paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
+def fetch():
+    download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
+    download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
index 3021b68ddb..7021a6da05 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -17,6 +17,8 @@ import hashlib
 import os
 import shutil
 import sys
+import importlib
+import paddle.v2.dataset
 
 __all__ = ['DATA_HOME', 'download', 'md5file']
 
@@ -69,3 +71,13 @@ def dict_add(a_dict, ele):
         a_dict[ele] += 1
     else:
         a_dict[ele] = 1
+
+
+def fetch_all():
+    """Run ``fetch()`` of every paddle.v2.dataset module that defines one."""
+    for module_name in dir(paddle.v2.dataset):
+        if module_name.startswith("__"):
+            continue
+        module = importlib.import_module("paddle.v2.dataset.%s" % module_name)
+        if hasattr(module, "fetch"):
+            module.fetch()
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
index dede47cbd3..f1b0ce16f2 100644
--- a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/v2/dataset/conll05.py
@@ -198,9 +198,9 @@ def test():
     return reader_creator(reader, word_dict, verb_dict, label_dict)
 
 
-def fetch_data():
-    paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
-    paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
-    paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
-    paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
-    paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
+def fetch():
+    download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
+    download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
+    download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
+    download(EMB_URL, 'conll05st', EMB_MD5)
+    download(DATA_URL, 'conll05st', DATA_MD5)
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 05e67f39d1..5284017ce0 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -125,5 +125,5 @@ def word_dict():
         re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
 
 
-def fetch_data():
+def fetch():
     paddle.v2.dataset.common.download(URL, 'imdb', MD5)
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
index b89628cea5..2931d06e7e 100644
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -91,5 +91,5 @@ def test(word_idx, n):
     return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n)
 
 
-def fetch_data():
+def fetch():
     paddle.v2.dataset.common.download(URL, "imikolov", MD5)
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
index 8f1ce4df25..48a39b5493 100644
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@@ -108,6 +108,8 @@ def test():
                                           TEST_LABEL_MD5), 100)
 
 
-def fetch_data():
+def fetch():
     paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
     paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+    paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
+    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
index cebf8a13ec..e148ddeca0 100644
--- a/python/paddle/v2/dataset/movielens.py
+++ b/python/paddle/v2/dataset/movielens.py
@@ -205,8 +205,8 @@ def unittest():
     print train_count, test_count
 
 
-def fetch_data():
-    paddle.v2.dataset.common.download(URL, "movielens", MD5)
+def fetch():
+    download(URL, "movielens", MD5)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py
index 3183288a7a..0eeb6d5aff 100644
--- a/python/paddle/v2/dataset/sentiment.py
+++ b/python/paddle/v2/dataset/sentiment.py
@@ -26,7 +26,7 @@ from itertools import chain
 import nltk
 from nltk.corpus import movie_reviews
 
-import paddle.v2.dataset.common
+import common
 
 __all__ = ['train', 'test', 'get_word_dict']
 NUM_TRAINING_INSTANCES = 1600
@@ -127,5 +127,5 @@ def test():
     return reader_creator(data_set[NUM_TRAINING_INSTANCES:])
 
 
-def fetch_data():
+def fetch():
     nltk.download('movie_reviews', download_dir=common.DATA_HOME)
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
index 5066c0886b..dab8620441 100644
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -91,5 +91,5 @@ def test():
     return reader
 
 
-def fetch_data():
-    paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)
+def fetch():
+    download(URL, 'uci_housing', MD5)
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index 759c77ccfd..ee63a93f5a 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -16,7 +16,7 @@ wmt14 dataset
 """
 import tarfile
 
-import paddle.v2.dataset.common
+from paddle.v2.dataset.common import download
 
 __all__ = ['train', 'test', 'build_dict']
 
@@ -95,15 +95,13 @@ def reader_creator(tar_file, file_name, dict_size):
 
 def train(dict_size):
     return reader_creator(
-        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
-        'train/train', dict_size)
+        download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size)
 
 
 def test(dict_size):
     return reader_creator(
-        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
-        'test/test', dict_size)
+        download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size)
 
 
-def fetch_data():
-    paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+def fetch():
+    download(URL_TRAIN, 'wmt14', MD5_TRAIN)

From 5854420f186b8902e77bb7a577b7d9d612978d17 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 24 Mar 2017 15:26:00 +0800
Subject: [PATCH 17/20] Fix NOT WITH_PYTHON

---
 cmake/external/python.cmake | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index 0accf1a8dd..9085e3799c 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -221,3 +221,7 @@ ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
 
 INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
 INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
+
+IF(NOT WITH_PYTHON)
+    SET(PYTHON_LIBRARIES "")
+ENDIF()
\ No newline at end of file

From df9993fd2cabe2f2fc88b14b8d8fa96096cfe1c8 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 24 Mar 2017 15:26:40 +0800
Subject: [PATCH 18/20] add blank

---
 cmake/external/python.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index 9085e3799c..93d7275df0 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -224,4 +224,4 @@ INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
 
 IF(NOT WITH_PYTHON)
     SET(PYTHON_LIBRARIES "")
-ENDIF()
\ No newline at end of file
+ENDIF()

From e821c5547fb2c51758c95c18b39ee8202d186d0a Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Mon, 27 Mar 2017 13:08:40 +0800
Subject: [PATCH 19/20] Fix rdma links bug

---
 paddle/api/CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index 3760c6727c..4d0dacae90 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -76,8 +76,6 @@ SWIG_LINK_LIBRARIES(swig_paddle
     ${CMAKE_DL_LIBS}
     ${EXTERNAL_LIBS}
     ${CMAKE_THREAD_LIBS_INIT}
-    ${RDMA_LD_FLAGS}
-    ${RDMA_LIBS}
     ${START_END}
 )
 

From 137d2cb24fda1aa7e274c20ff151194c046a46b2 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Mon, 27 Mar 2017 16:39:31 +0800
Subject: [PATCH 20/20] remove with_metric_learning

---
 doc/getstarted/build_and_install/ubuntu_install_cn.rst | 1 -
 paddle/scripts/submit_local.sh.in                      | 1 -
 2 files changed, 2 deletions(-)

diff --git a/doc/getstarted/build_and_install/ubuntu_install_cn.rst b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
index d02d9c63bb..9e39ccb00f 100644
--- a/doc/getstarted/build_and_install/ubuntu_install_cn.rst
+++ b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
@@ -46,7 +46,6 @@ PaddlePaddle提供了ubuntu 14.04 deb安装包。
         with_double: OFF
         with_python: ON
         with_rdma: OFF
-        with_metric_learning:
         with_timer: OFF
         with_predict_sdk:
 
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 5a45df4072..0c0fea29df 100644
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -21,7 +21,6 @@ function version(){
         echo "    with_double: @WITH_DOUBLE@"
         echo "    with_python: @WITH_PYTHON@"
         echo "    with_rdma: @WITH_RDMA@"
-        echo "    with_metric_learning: @WITH_METRIC@"
         echo "    with_timer: @WITH_TIMER@"
         echo "    with_predict_sdk: @WITH_PREDICT_SDK@"
 }