From f35c8c42604bd06dbb964a4c26e9ec9d4a2cb94d Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Sun, 2 Jul 2017 17:10:05 +0800 Subject: [PATCH 001/100] remove simple_op_design.md --- doc/design/simple_op_design.md | 273 --------------------------------- 1 file changed, 273 deletions(-) delete mode 100644 doc/design/simple_op_design.md diff --git a/doc/design/simple_op_design.md b/doc/design/simple_op_design.md deleted file mode 100644 index 93c0f68ca9..0000000000 --- a/doc/design/simple_op_design.md +++ /dev/null @@ -1,273 +0,0 @@ -## Interaction between C++ and Python - -Users employ API in Python to describe their own network, however, the network construction actually happens in C++. so Protobuf is introduced to send the message between Python and C++. - -The Interaction between Python and C++ can be simplified as two steps: - -1. C++ tells Python how many Ops there are, and what parameter do users need to offer to initialize a new Op. Python then builds API for each Op at compile time. - -2. Users invoke APIs built by Python and provide necessary parameters. These parameters will be sent to C++ fo finish Op construction task. - -### Message form C++ to Python - -We define a Protobuf message class `OpProto` to hold message needed in the first step. What should an `OpProto` contain? This question is equivalent to “What message do we need to offer, to build a Python API which is legal and user oriented and can use to describe a whole Op.” - -Following message are necessary: - -1. Op's name, and its simple comment. -2. Input and output variable number; each variable's name, type, and comment. -3. Op's attributes; each attribute includes name, type, comment, **default value** and **value range**. - -So `OpProto` can be defined as follows: - -```proto -enum AttrType { - INT = 1; - FLOAT = 2; - STRING = 3; - INTS = 4; - FLOATS = 5; - STRINGS = 6; -}; - -message AttrValue { - AttrType type = 1; - optional int iv = 2; - optional float fv = 3; - optional string sv = 4; - repeated int ivs = 5; - repeated float fvs = 6; - repeated string svs = 7; -}; - -message AttrProto { - required string name = 1; - required string comment = 2; - optional AttrValue default = 3; - optional AttrValue max = 4; - optional AttrValue min = 5; - required AttrType type = 6; -}; - -message VarProto { - required string name = 1; - required string comment = 2; -}; - -message OpProto { - repeated VarProto inputs = 1; - repeated VarProto outputs = 2; - repeated AttrProto attrs = 3; - required string type = 4; - required string comment = 5; -}; -``` - -The default value and value range didn't appear in out previous design. By adding these two fields, we are able to check attribute validity in Python and find out possible error as soon as possible. What's more, by providing the message about default value and value range to Python docstring, it helps to automatically generate more comprehensive documents. - -### Message from Python to C++ - -To hold message needed in the above second step, we define Protobuf message class `OpDesc`. It is used to hold user-specified parameters in Op describing. - -```proto -message OpDesc { - required string type = 1; - repeated string inputs = 2; - repeated string outputs = 3; - map attrs = 4; -}; -``` - -## OpProto Register - -Every Op has its own `OpProto`. For using convenience, we need to register them and record all their messages. For each `Op` class, we define a corresponding `OpMaker` class, in whose constructor we implement the `OpProto`'s building process. 
`OpMaker`'s constructor will be invoked by another function `OpRegistry::RegisterOp()`. - -```cpp -class OpProtoMaker { -public: - OpProtoMaker(OpProto* proto): proto_(proto) {} -protected: - OpProto* proto_; - void AddInput(const std::string& name, const std::string& desc) {...} - void AddAttr(const std::string& name, const std::string& desc, TypeId type) {...} - void AddComment(const std::string& comment) { ... } -}; - -class OpRegistry { -public: - using OpCreator = std::function; - - template - static void RegisterOp(const std::string& name) { - gCreators_[name] = [](const OpDesc& desc) { - return new OpType(desc); - }; - OpProto& opProto = gProtos_[name]; - OpMaker()(&opProto); - } - - static map gCreators_; - static map gProtos_; -}; - -template -class OpRegister { - public: - OpRegister(std::string type) { - OpRegistry::RegisterOp(type); - } -}; - -#define REGISTER_OP(op_class, op_maker_class, type_name) \ - class op_class##Register { \ - private: \ - const static OpRegister<#op_class, #op_maker_class> reg; \ - }; \ - const Register op_class##Register::reg(#type_name); - -class CosineOp { -// ... -} - -struct CosineOpProtoMaker : public OpProtoMaker { - CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) { - AddInput("input", "input of cosine op"); - AddAttr("scale", "scale of cosine op", float).Default(1.0).LargerThan(0.0); - AddType("cos"); - AddComment("This is cos op"); - } -} - -REGISTER_OP(CosineOp, CosineOpProtoMaker, cos); -``` - -In `REGISTER_OP(CosineOp, CosineOpProtoMaker, cos)`, we register not only `CosineOp` but also `CosineOpProto`. As fields of `CosineOpProto`, the default value and value range of `scale` are also registered here. - -## Python API - -Python APIs are divided into two types, high-level API and low-level API. - -### High-Level API - -High-level API is called by users directly, so it should keep its style consistent with existing V2 APIs. - -Here is a sample about how a define a fc layer: - -```python -hd = fc_layer(input=data, size=56, with_bias=True, activation="sigmoid"); -``` - -`hd` is the output of `fc_layer` and it's a `variable`. It can be further sent into other layers as input. - -The definition of `fc_layer()`: - -```python -def fc_layer(input, size, with_bias, activation): - attr_map = {"size":size} - check_attrs(attr_map) - w = make_variable('w') - if with_bias: - b = make_variable('b') - else: - b = None - fc_output = make_variable('fc_output'); - fc_op(input, w, b, fc_output, attr_map) - act_output = make_variable('sigmod_output'); - if activation == "sigmod": - sigmod_op(fc_output, act_output); - elif: - # ... - return act_output; -``` - -### Low Leval API - -In above sample, `fc_op` and `sigmod_op` are low-level API. They build `OpDesc` and invoke corresponding C++ code. - -*TODO* - -## Op and Kernal - -After completely defined, an Op will be run in a network. However, Op's computing method may differ on different devices. One solution is that write an `Op`'s member function `Op::run()`, which contains computing methods of all possible devices. That may be a bad idea because we have to change all `Op`'s code to add a new device. - -Another choice is adding a concept named `kernal`. A `Kernal` describes an op's computing process on a certain device. After stripping `Variable` and `kernal`, `Op` becomes a pure conceptual class, which holds neither data nor detailed computing process. 
- -```cpp -class KernalBase { -public: - virtual void RunOnDevice(std::vector input_vars, - std::vector input_vars, - const OpAttrs* attrs) = 0; -}; - -template -class CosineKernal : public KernalBase { -public: - virtual void RunOnDevice(std::vector input_vars, - std::vector input_vars, - const OpAttrs* attrs) { - // no implementation - } -}; - -template <> -class CosineKernal : public KernalBase { -public: - virtual void RunOnDevice(std::vector input_vars, - std::vector input_vars, - const OpAttrs* attrs) { - CosineOpAttrs* cosine_attrs = static_cast(attrs); - // computing code - // ... - } -}; - -struct OpAttrs {...}; - -class Op { - public: - std::string get_kernal_name() { - return kernel_name_; - } - const vector& get_input_names() { - return input_names_; - } - const vector& get_output_names() { - return output_names_; - } - // ... - private: - std::vector input_names_; - std::vector output_names_; - std::string kernal_name_; - -} - -struct CosineOpAttrs : public OpAttrs { - float scale_; -} - -class CosineOp : public Op { - public: - const CosineOpAtrrs* get_attrs() { - return &attrs; - } - - private: - CosineOpAttrs attrs; -} - -RunOp(const Op& op, Scope scope) { - Kernal* kernal = get_kernal(scope, op.get_kernal_name()); - std::vector input_vars = - get_variables(scope, op.get_input_name()); - std::vector output_vars = - get_variables(scope, op.get_output_name()); - - kernal->RunOnDevice(input_vars, output_vars, op.get_attrs()); -} -``` - -All `Kernal` need to be registered beforehand, just like `Op`. - -Now, `Op` is no longer has `Run()` function. It only contains names of variables and kernels. During network running, `RunOp()` is called to invoke `Op`'s corresponding `Kernal`. `get_kernal()` is supposed to return `kernal` for current device. From 7dc53ea0ed08b04abf047c2827e339a766bbb983 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 3 Jul 2017 14:22:12 +0800 Subject: [PATCH 002/100] renew simple_op_design.md --- doc/design/simple_op_design.md | 202 +++++++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 doc/design/simple_op_design.md diff --git a/doc/design/simple_op_design.md b/doc/design/simple_op_design.md new file mode 100644 index 0000000000..2c1c7f6f14 --- /dev/null +++ b/doc/design/simple_op_design.md @@ -0,0 +1,202 @@ +## Interaction between C++ and Python + +Users employ API in Python to describe their own network, however, the network construction actually happens in C++. so Protobuf is introduced to send the message between Python and C++. + +The Interaction between Python and C++ can be simplified as two steps: + +1. C++ tells Python how many Ops there are, and what parameter do users need to offer to initialize a new Op. Python then builds API for each Op at compile time. + +2. Users invoke APIs built by Python and provide necessary parameters. These parameters will be sent to C++ fo finish Op construction task. + +### Message form C++ to Python + +We define a Protobuf message class `OpProto` to hold message needed in the first step. What should an `OpProto` contain? This question is equivalent to “What message do we need to offer, to build a Python API which is legal and user oriented and can use to describe a whole Op.” + +Following message are necessary: + +1. Op's name, and its simple comment. +2. Input and output variable number; each variable's name, type, and comment. +3. Op's attributes; each attribute includes name, type, comment, **default value** and **value range**. 
+ +So `OpProto` can be defined as follows: + +```proto +enum AttrType { + INT = 1; + FLOAT = 2; + STRING = 3; + INTS = 4; + FLOATS = 5; + STRINGS = 6; +}; + +message AttrValue { + AttrType type = 1; + optional int iv = 2; + optional float fv = 3; + optional string sv = 4; + repeated int ivs = 5; + repeated float fvs = 6; + repeated string svs = 7; +}; + +message AttrProto { + required string name = 1; + required string comment = 2; + required AttrType type = 3; +}; + +message VarProto { + required string name = 1; + required string comment = 2; + required bool is_tensor = 3; +}; + +message OpProto { + repeated VarProto inputs = 1; + repeated VarProto outputs = 2; + repeated AttrProto attrs = 3; + required string type = 4; + required string comment = 5; +}; +``` + +To generate Python code automatically: + +```python +def create_python_ops_creatation_functions(): + op_protos = paddle.framework.OpRegistry.get_all_op_proto() + for type_name in op_protos: + op_proto = op_protos[type_name] + def __impl__(**kwargs): # User must use key word args in Paddle API + inputs = [kwargs.get(ipt.name, "") for ipt in op_proto.inputs] + outputs = [kwargs.get(opt.name, "") for opt in op_proto.outputs] + attrs = [cast_to_op_attr(attr, kwargs.get(attr.name, None)) for attr in op_proto.attrs] + opdesc = (input, outputs, type_name, attrs) + return paddle.framework.OpRegistry.CreateOp(opdesc) + __impl__.__doc__ = create_doc_string(op_proto) + globals()[type_name] = __impl__ + +create_python_ops_creatation_functions() +``` + +### Message from Python to C++ + +To hold message needed in the above second step, we define Protobuf message class `OpDesc`. It is used to hold user-specified parameters in Op describing. + +```proto +message OpDesc { + required string type = 1; + repeated string inputs = 2; + repeated string outputs = 3; + map attrs = 4; +}; +``` + +## OpProto Register + +Every Op has its own `OpProto`. For using convenience, we need to register them and record all their messages. For each `Op` class, we define a corresponding `OpMaker` class, in whose constructor we implement the `OpProto`'s building process. `OpMaker`'s constructor will be invoked by another function `OpRegistry::RegisterOp()`. + +```cpp +class OpProtoMaker { +public: + OpProtoMaker(OpProto* proto): proto_(proto) {} +protected: + OpProto* proto_; + void AddInput(const std::string& name, const std::string& desc) {...} + void AddAttr(const std::string& name, const std::string& desc, TypeId type) {...} + void AddComment(const std::string& comment) { ... } +}; + +class OpRegistry { +public: + using OpCreator = std::function; + + template + static void RegisterOp(const std::string& name) { + gCreators_[name] = [](const OpDesc& desc) { + return new OpType(desc); + }; + OpProto& opProto = gProtos_[name]; + OpMaker()(&opProto); + } + + static map gCreators_; + static map gProtos_; +}; + +template +class OpRegister { + public: + OpRegister(std::string type) { + OpRegistry::RegisterOp(type); + } +}; + +#define REGISTER_OP(op_class, op_maker_class, type_name) \ + class op_class##Register { \ + private: \ + const static OpRegister<#op_class, #op_maker_class> reg; \ + }; \ + const Register op_class##Register::reg(#type_name); + +class CosineOp { +// ... 
+} + +struct CosineOpProtoMaker : public OpProtoMaker { + CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) { + AddInput("input", "input of cosine op"); + AddAttr("scale", "scale of cosine op", float).Default(1.0).LargerThan(0.0); + AddType("cos"); + AddComment("This is cos op"); + } +} + +REGISTER_OP(CosineOp, CosineOpProtoMaker, cos); +``` + +In `REGISTER_OP(CosineOp, CosineOpProtoMaker, cos)`, we register not only `CosineOp` but also `CosineOpProto`. As fields of `CosineOpProto`, the default value and value range of `scale` are also registered here. + +## Python API + +Python APIs are divided into two types, high-level API and low-level API. + +### High-Level API + +High-level API is called by users directly, so it should keep its style consistent with existing V2 APIs. + +Here is a sample about how a define a fc layer: + +```python +hd = fc_layer(input=data, size=56, with_bias=True, activation="sigmoid"); +``` + +`hd` is the output of `fc_layer` and it's a `variable`. It can be further sent into other layers as input. + +The definition of `fc_layer()`: + +```python +def fc_layer(input, size, with_bias, activation): + attr_map = {"size":size} + check_attrs(attr_map) + w = make_variable('w') + if with_bias: + b = make_variable('b') + else: + b = None + fc_output = make_variable('fc_output'); + fc_op(input, w, b, fc_output, attr_map) + act_output = make_variable('sigmod_output'); + if activation == "sigmod": + sigmod_op(fc_output, act_output); + elif: + # ... + return act_output; +``` + +### Low Leval API + +In above sample, `fc_op` and `sigmod_op` are low-level API. They build `OpDesc` and invoke corresponding C++ code. + +*TODO* From 211f83fa2257716421f7db0431a5e707e788773a Mon Sep 17 00:00:00 2001 From: zlx Date: Tue, 4 Jul 2017 17:05:25 +0800 Subject: [PATCH 003/100] set depthwise conv layer interface in python --- python/paddle/trainer/config_parser.py | 57 ++++++++++++ .../paddle/trainer_config_helpers/layers.py | 90 +++++++++++++++++++ 2 files changed, 147 insertions(+) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index b7418101d8..2965c922fa 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1741,6 +1741,59 @@ class ParameterReluLayer(LayerBase): self.create_input_parameter(0, input_layer.size / partial_sum) +@config_layer('depthwise_conv') +class DepthwiseConvLayer(LayerBase): + layer_type = 'depthwise_conv' + + def __init__(self, + name, + inputs=[], + bias=True, + num_filters=None, + shared_biases=False, + **xargs): + super(DepthwiseConvLayer, self).__init__( + name, self.layer_type, 0, inputs=inputs, **xargs) + + if num_filters is not None: + self.config.num_filters = num_filters + + use_gpu = int(g_command_config_args.get("use_gpu", 0)) + parallel_nn = int(g_command_config_args.get("parallel_nn", 0)) + + # Automatically select cudnn_type for GPU and exconv for CPU + # if set type=conv, but still reserve the way user specify + # exconv or cudnn_conv manually. 
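+        # Note: the layer type is fixed to "depthwise_conv" here; use_gpu and
+        # parallel_nn are read above but do not change the selected type.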
+ self.layer_type = "depthwise_conv" + # need to specify layer in config + self.config.type = self.layer_type + + if shared_biases is not None: + self.config.shared_biases = shared_biases + + for input_index in xrange(len(self.inputs)): + input_layer = self.get_input_layer(input_index) + conv_conf = self.config.inputs[input_index].conv_conf + #set the groups + self.inputs[input_index].conv.groups = self.inputs[ + input_index].conv.channels + parse_conv(self.inputs[input_index].conv, input_layer.name, + conv_conf, num_filters) + psize = self.calc_parameter_size(conv_conf) + self.create_input_parameter(input_index, psize) + self.set_cnn_layer(name, conv_conf.output_y, conv_conf.output_x, + self.config.num_filters) + + psize = self.config.size + if shared_biases: + psize = self.config.num_filters + self.create_bias_parameter(bias, psize, [psize, 1]) + + def calc_parameter_size(self, conv_conf): + return self.config.num_filters * conv_conf.filter_channels \ + * (conv_conf.filter_size * conv_conf.filter_size_y) + + @config_layer('conv') class ConvLayerBase(LayerBase): layer_type = 'conv' @@ -3145,6 +3198,10 @@ def ParameterHook(type, **kwargs): if sparsity_ratio is not None: hook.sparsity_ratio = sparsity_ratio return hook + elif type == 'dpruning': + hook = ParameterUpdaterHookConfig() + hook.type = type + return hook else: return None diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index a601d5c84a..073e853bc2 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -57,6 +57,7 @@ __all__ = [ 'classification_cost', 'LayerOutput', 'img_conv_layer', + 'img_depthwise_conv_layer', 'img_pool_layer', 'batch_norm_layer', 'img_cmrnorm_layer', @@ -148,6 +149,7 @@ class LayerType(object): HSIGMOID = 'hsigmoid' CONV_LAYER = 'conv' CONVTRANS_LAYER = 'convt' + DEPTHWISE_CONV_LAYER = 'depthwise_conv' EXCONV_LAYER = 'exconv' EXCONVTRANS_LAYER = 'exconvt' CUDNNCONV_LAYER = 'cudnn_conv' @@ -2085,6 +2087,94 @@ def hsigmoid(input, name, LayerType.HSIGMOID, parents=parents, size=l.config.size) +@wrap_name_default("depthwise_conv") +@wrap_param_attr_default() +@wrap_bias_attr_default() +@wrap_act_default(act=ReluActivation()) +@layer_support(DROPOUT) +def img_depthwise_conv_layer(input, + filter_size, + num_filters, + name=None, + num_channels=None, + act=None, + groups=1, + stride=1, + padding=0, + bias_attr=None, + param_attr=None, + shared_biases=True, + layer_attr=None, + filter_size_y=None, + stride_y=None, + padding_y=None, + trans=False, + layer_type=None): + + if num_channels is None: + assert input.num_filters is not None + num_channels = input.num_filters + + if filter_size_y is None: + if isinstance(filter_size, collections.Sequence): + assert len(filter_size) == 2 + filter_size, filter_size_y = filter_size + else: + filter_size_y = filter_size + + if stride_y is None: + if isinstance(stride, collections.Sequence): + assert len(stride) == 2 + stride, stride_y = stride + else: + stride_y = stride + + if padding_y is None: + if isinstance(padding, collections.Sequence): + assert len(padding) == 2 + padding, padding_y = padding + else: + padding_y = padding + + if param_attr.attr.get('initial_smart'): + # special initial for conv layers. 
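+        # He (MSRA) style initialization:
+        # std = sqrt(2 / (filter_size^2 * num_channels)), mean = 0.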
+ init_w = (2.0 / (filter_size**2 * num_channels))**0.5 + param_attr.attr["initial_mean"] = 0.0 + param_attr.attr["initial_std"] = init_w + param_attr.attr["initial_strategy"] = 0 + param_attr.attr["initial_smart"] = False + + lt = LayerType.DEPTHWISE_CONV_LAYER + + l = Layer( + name=name, + inputs=Input( + input.name, + conv=Conv( + filter_size=filter_size, + padding=padding, + stride=stride, + channels=num_channels, + groups=groups, + filter_size_y=filter_size_y, + padding_y=padding_y, + stride_y=stride_y), + **param_attr.attr), + active_type=act.name, + num_filters=num_filters, + bias=ParamAttr.to_bias(bias_attr), + shared_biases=shared_biases, + type=lt, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, + lt, + parents=[input], + activation=act, + num_filters=num_filters, + size=l.config.size) + + @wrap_name_default("conv") @wrap_param_attr_default() @wrap_bias_attr_default() From eeb17c26fdfed5d3cb157ceabf0a89ec93329414 Mon Sep 17 00:00:00 2001 From: zlx Date: Tue, 4 Jul 2017 17:06:25 +0800 Subject: [PATCH 004/100] add depthwise operation and depthwise conv layer --- paddle/function/DepthwiseConvOp.cpp | 308 +++++++++++++++++++ paddle/function/DepthwiseConvOp.h | 91 ++++++ paddle/function/DepthwiseConvOpGpu.cu | 295 ++++++++++++++++++ paddle/gserver/layers/DepthwiseConvLayer.cpp | 165 ++++++++++ paddle/gserver/layers/DepthwiseConvLayer.h | 52 ++++ 5 files changed, 911 insertions(+) create mode 100644 paddle/function/DepthwiseConvOp.cpp create mode 100644 paddle/function/DepthwiseConvOp.h create mode 100644 paddle/function/DepthwiseConvOpGpu.cu create mode 100644 paddle/gserver/layers/DepthwiseConvLayer.cpp create mode 100644 paddle/gserver/layers/DepthwiseConvLayer.h diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp new file mode 100644 index 0000000000..ad332d2931 --- /dev/null +++ b/paddle/function/DepthwiseConvOp.cpp @@ -0,0 +1,308 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "DepthwiseConvOp.h" +#include "GemmFunctor.h" +#include "paddle/math/MemoryHandle.h" + +namespace paddle { + +/* + * imData = [input_channels, input_height, input_width] + * colData = [input_channels, filter_height, filter_width, + * output_height, output_width] + */ +template +class DepthwiseConvFunctor { +public: + void operator()(int outputSize, + const T* inputData, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* outputData) { + // NO_IMPLEMENTATION + } +}; + +template +class DepthwiseConvGradInputFunctor { +public: + void operator()(int inputSize, + const T* outputGrad, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* inputGrad) {} +}; + +template +class DepthwiseConvGradFilterFunctor { +public: + void operator()(int num_i, + int colDataSize, + const T* outputGrad, + const T* inputData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* colData, + T* multiplierData, + T* filterGrad) {} +}; + +/* + * \brief Forward calculation of convolution. + */ +template +class DepthwiseConvFunction : public ConvFunctionBase { +public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + virtual void check(const BufferArgs& inputs, + const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + size_t batchSize = input[0]; + // size_t inputChannels = input[1]; + // size_t inputHeight = input[2]; + // size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + + real* inputData = inputs[0].data(); + real* filterData = inputs[1].data(); + real* outputData = outputs[0].data(); + size_t outputSize = batchSize * outputChannels * outputHeight * outputWidth; + + DepthwiseConvFunctor depthwiseConv; + depthwiseConv(outputSize, + inputData, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + filterHeight, + filterWidth, + strideH(), + strideW(), + paddingH(), + paddingW(), + outputData); + } +}; + +/* + * \brief Backward input calculation of convolution. 
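+ *        Computes the input gradient from the output gradient and the filter.
+ *        Each channel is handled independently, since groups equals the
+ *        number of input channels for depthwise convolution.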
+ */ +template +class DepthwiseConvGradInputFunction : public ConvFunctionBase { +public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + virtual void check(const BufferArgs& inputs, + const BufferArgs& outputs) override { + const TensorShape& output = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& input = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + // Since the implementation of Col2ImFunctor is ADD_TO, + // this function only supports ADD_TO mode. + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + const TensorShape& output = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& input = outputs[0].shape(); + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + + real* outputGrad = inputs[0].data(); + real* filterData = inputs[1].data(); + real* inputGrad = outputs[0].data(); + + size_t inputSize = batchSize * inputChannels * inputHeight * inputWidth; + + DepthwiseConvGradInputFunctor depthwiseConvGradInput; + depthwiseConvGradInput(inputSize, + outputGrad, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideH(), + strideW(), + paddingH(), + paddingW(), + inputGrad); + } +}; + +/* + * \brief Backward filter calculation of convolution. 
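+ *        Per-position products of outputGrad and inputData are first written
+ *        to a temporary colData buffer, then reduced over the output positions
+ *        by a GEMM against the all-ones multiplierData vector to produce
+ *        filterGrad.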
+ */ +template +class DepthwiseConvGradFilterFunction : public ConvFunctionBase { +public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + virtual void check(const BufferArgs& inputs, + const BufferArgs& outputs) override { + const TensorShape& output = inputs[0].shape(); + const TensorShape& input = inputs[1].shape(); + const TensorShape& filter = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + const TensorShape& output = inputs[0].shape(); + const TensorShape& input = inputs[1].shape(); + // const TensorShape& multiplier = inputs[2].shape(); + const TensorShape& filter = outputs[0].shape(); + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + + real* outputGrad = inputs[0].data(); + real* inputData = inputs[1].data(); + real* multiplierData = inputs[2].data(); + real* filterGrad = outputs[0].data(); + + size_t size = + inputChannels * filterHeight * filterWidth * outputHeight * outputWidth; + + resizeBuffer(size); + real* colData = reinterpret_cast(memory_->getBuf()); + + DepthwiseConvGradFilterFunctor depthwiseConvGradFilter; + + for (size_t i = 0; i < batchSize; i++) { + depthwiseConvGradFilter(i, + size, + outputGrad, + inputData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideH(), + strideW(), + paddingH(), + paddingW(), + colData, + multiplierData, + filterGrad); + } + } +}; + +REGISTER_TYPED_FUNC(DepthwiseConv, CPU, DepthwiseConvFunction); +REGISTER_TYPED_FUNC(DepthwiseConvGradInput, + CPU, + DepthwiseConvGradInputFunction); +REGISTER_TYPED_FUNC(DepthwiseConvGradFilter, + CPU, + DepthwiseConvGradFilterFunction); +#ifndef PADDLE_ONLY_CPU +REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction); +REGISTER_TYPED_FUNC(DepthwiseConvGradInput, + GPU, + DepthwiseConvGradInputFunction); +REGISTER_TYPED_FUNC(DepthwiseConvGradFilter, + GPU, + DepthwiseConvGradFilterFunction); +#endif + +} // namespace paddle diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/function/DepthwiseConvOp.h new file mode 100644 index 0000000000..8af1db974d --- /dev/null +++ b/paddle/function/DepthwiseConvOp.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "ConvOp.h" + +namespace paddle { + +/* + * imData = [input_channels, input_height, input_width] + * colData = [input_channels, filter_height, filter_width, + * output_height, output_width] + */ +template +class DepthwiseConvFunctor { +public: + void operator()(int outputSize, + const T* inputData, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* outputData); +}; + +template +class DepthwiseConvGradInputFunctor { +public: + void operator()(int inputSize, + const T* outputGrad, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* inputGrad); +}; + +template +class DepthwiseConvGradFilterFunctor { +public: + void operator()(int num_i, + int colDataSize, + const T* outputGrad, + const T* inputData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* colData, + T* multiplierData, + T* filterGrad); + +}; // namespace paddle + +} // namespace paddle diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu new file mode 100644 index 0000000000..1b2d5d99ed --- /dev/null +++ b/paddle/function/DepthwiseConvOpGpu.cu @@ -0,0 +1,295 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "ConvOp.h" +#include "DepthwiseConvOp.h" + +namespace paddle { +template +__global__ void ConvolutionDepthwiseWeightForward(const int nthreads, + const T* const bottom_data, const T* const weight_data, + const int num, const int channels, const int top_height, + const int top_width, const int bottom_height, const int bottom_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, T* const top_data) { + + int index = + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(index < nthreads) { + const int n = index / channels / top_height / top_width; + const int c = (index / top_height / top_width) % channels; + const int h = (index / top_width) % top_height; + const int w = index % top_width; + const T* weight = weight_data + c * kernel_h * kernel_w; + T value = 0; + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + const int h_in = -pad_h + h * stride_h + kh * dilation_h; + const int w_in = -pad_w + w * stride_w + kw * dilation_w; + if ((h_in >= 0) && (h_in < bottom_height) + && (w_in >= 0) && (w_in < bottom_width)) { + const int offset = ((n * channels + c) * bottom_height + h_in) + * bottom_width + w_in; + value += (*weight) * bottom_data[offset]; + } + ++weight; + } + } + top_data[index] = value; + } +} + +template +__global__ void ConvolutionDepthwiseBottomBackward(const int nthreads, + const T* const top_diff, const T* const weight_data, + const int num, const int channels, const int top_height, + const int top_width, const int bottom_height, const int bottom_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, T* const bottom_diff) { + int index = + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if(index < nthreads) { + const int n = index / channels / bottom_height / bottom_width; + const int c = (index / bottom_height / bottom_width) % channels; + const int h = (index / bottom_width) % bottom_height; + const int w = index % bottom_width; + const T* weight = weight_data + c * kernel_h * kernel_w; + T value = 0; + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + const int h_out_s = h + pad_h - kh * dilation_h; + const int w_out_s = w + pad_w - kw * dilation_w; + if (((h_out_s % stride_h) == 0) && ((w_out_s % stride_w) == 0)) { + const int h_out = h_out_s / stride_h; + const int w_out = w_out_s / stride_w; + //it affect the effectives + if ((h_out >= 0) && (h_out < top_height) + && (w_out >= 0) && (w_out < top_width)) { + const int offset = ((n * channels + c) * top_height + h_out) + * top_width + w_out; + value += (*weight) * top_diff[offset]; + } + } + ++weight; + } + } + bottom_diff[index] += value; + } +} + +template +__global__ void ConvolutionDepthwiseWeightBackward(const int num_i, const int nthreads, + const T* const top_diff, const T* const bottom_data, + const int num, const int channels, const int top_height, + const int top_width, const int bottom_height, const int bottom_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, T* const buffer_data) { + int index = + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int h = (index / top_width) % top_height; + 
const int w = index % top_width; + const int kh = (index / kernel_w / top_height / top_width) + % kernel_h; + const int kw = (index / top_height / top_width) % kernel_w; + const int h_in = -pad_h + h * stride_h + kh * dilation_h; + const int w_in = -pad_w + w * stride_w + kw * dilation_w; + if ((h_in >= 0) && (h_in < bottom_height) + && (w_in >= 0) && (w_in < bottom_width)) { + const int c = index / kernel_h / kernel_w / top_height / top_width; + const int n = num_i; + const int top_offset = ((n * channels + c) * top_height + h) + * top_width + w; + const int bottom_offset = ((n * channels + c) * bottom_height + h_in) + * bottom_width + w_in; + buffer_data[index] = top_diff[top_offset] * bottom_data[bottom_offset]; + } else { + buffer_data[index] = 0; + } + } +} + +template +class DepthwiseConvFunctor{ +public: + void operator()(int outputSize, + const T* inputData, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* outputData){ + + size_t blocks = (outputSize + 1024 -1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks+512-1)/512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + + ConvolutionDepthwiseWeightForward + <<< grid, threads, 0, STREAM_DEFAULT >>>( + outputSize, + inputData, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + outputData); + } +}; + +template +class DepthwiseConvGradInputFunctor{ +public: + void operator()(int inputSize, + const T* outputGrad, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* inputGrad){ + + size_t blocks = (inputSize + 1024 -1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks+512-1)/512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + + ConvolutionDepthwiseBottomBackward + // NOLINT_NEXT_LINE(whitespace/operators) + <<< grid, threads, 0, STREAM_DEFAULT >>>( + inputSize, + outputGrad, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + inputGrad); + } +}; + +template +class DepthwiseConvGradFilterFunctor { +public: + void operator()(int num_i, + int colDataSize, + const T* outputGrad, + const T* inputData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* colData, + T* multiplierData, + T* filterGrad){ + + size_t blocks = (colDataSize + 1024 -1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks+512-1)/512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + + ConvolutionDepthwiseWeightBackward + <<< grid, threads, 0, STREAM_DEFAULT >>>( + i, + size, + outputGrad, + inputData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + colData + ); + GemmFunctor gemm; + int M = size / outputHeight / outputWidth; + int N = 1; + int K = outputHeight * outputWidth; + gemm(CblasNoTrans, + CblasNoTrans, + M, + N, + K, + 1.0f, + colData, + K, + multiplierData, + N, 
+ 1.0f, + filterGrad, + N); + //gemv + } +}; + +template class DepthwiseConvGradInputFunctor; +template class DepthwiseConvGradInputFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvGradFilterFunctor; +template class DepthwiseConvGradFilterFunctor; + +} // namespace paddle diff --git a/paddle/gserver/layers/DepthwiseConvLayer.cpp b/paddle/gserver/layers/DepthwiseConvLayer.cpp new file mode 100644 index 0000000000..9df8a9df7c --- /dev/null +++ b/paddle/gserver/layers/DepthwiseConvLayer.cpp @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "DepthwiseConvLayer.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +/* + * The calculation of the exconvt(convolution transpose (deconv) operation) + * is a swap of forward and backward of the calculation of exconv. + * */ +REGISTER_LAYER(depthwise_conv, DepthwiseConvLayer); + +bool DepthwiseConvLayer::init(const LayerMap &layerMap, + const ParameterMap ¶meterMap) { + /* Initialize the basic convolutional parent class */ + ExpandConvBaseLayer::init(layerMap, parameterMap); + + size_t numInputs = config_.inputs_size(); + inputShape_.resize(numInputs); + filterShape_.resize(numInputs); + outputShape_.resize(numInputs); + multiplierShape_.resize(numInputs); + weightMultiplier_.resize(numInputs); + + for (int i = 0; i < config_.inputs_size(); i++) { + std::vector paddings = {(size_t)paddingY_[i], (size_t)padding_[i]}; + std::vector strides = {(size_t)strideY_[i], (size_t)stride_[i]}; + Matrix::resizeOrCreate(weightMultiplier_[i], + (size_t)outputH_[i] * (size_t)outputW_[i], + (size_t)1, + false, + useGpu_); + weightMultiplier_[i]->one(); + createFunction(forward_, + "DepthwiseConv", + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)groups_[i])); + + createFunction(backward_, + "DepthwiseConvGradInput", + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)groups_[i])); + + createFunction(backward_, + "DepthwiseConvGradFilter", + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)groups_[i])); + } + return true; +} + +// i is the index of input layers +#define BACKWARD_INPUT(i, inputs, outputs) \ + backward_[2 * i]->calc(inputs, outputs) +#define BACKWARD_FILTER(i, inputs, outputs) \ + backward_[2 * i + 1]->calc(inputs, outputs) + +void DepthwiseConvLayer::forward(PassType passType) { + Layer::forward(passType); + + size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + resetOutput(batchSize, getOutputSize()); + + // Calculate the shape of the input, output, and filter. 
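+  // For depthwise convolution groups_ equals channels_, so each filter group
+  // in filterShape_ below covers exactly one input channel.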
+ for (size_t i = 0; i < inputLayers_.size(); ++i) { + inputShape_[i] = TensorShape({(size_t)batchSize, + (size_t)channels_[i], + (size_t)imgSizeH_[i], + (size_t)imgSizeW_[i]}); + multiplierShape_[i] = + TensorShape({(size_t)outputH_[i] * (size_t)outputW_[i], (size_t)1}); + filterShape_[i] = TensorShape({(size_t)groups_[i], + (size_t)numFilters_ / groups_[i], + (size_t)channels_[i] / groups_[i], + (size_t)filterSizeY_[i], + (size_t)filterSize_[i]}); + outputShape_[i] = TensorShape({(size_t)batchSize, + (size_t)numFilters_, + (size_t)outputH_[i], + (size_t)outputW_[i]}); + } + + // Calculate the output value. + for (size_t i = 0; i < inputLayers_.size(); ++i) { + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(i), inputShape_[i]); + inputs.addArg(*weights_[i]->getW(), filterShape_[i]); + outputs.addArg( + *getOutputValue(), outputShape_[i], i == 0 ? ASSIGN_TO : ADD_TO); + + forward_[i]->calc(inputs, outputs); + } + + /* add the bias-vector */ + if (biases_.get()) { + if (sharedBiases_) { + addSharedBias(); + } else { + addUnsharedBias(); + } + } + + /* activation */ + forwardActivation(); +} + +void DepthwiseConvLayer::backward(const UpdateCallback &callback) { + backwardActivation(); + + MatrixPtr outGrad = getOutputGrad(); + if (biases_ && biases_->getWGrad()) { + bpropBiases(outGrad); + /* Increasing the number of gradient */ + biases_->getParameterPtr()->incUpdate(callback); + } + + // Calculate the input grad and filter grad. + for (size_t i = 0; i < inputLayers_.size(); ++i) { + if (getInputGrad(i)) { + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), outputShape_[i]); + inputs.addArg(*weights_[i]->getW(), filterShape_[i]); + outputs.addArg(*getInputGrad(i), inputShape_[i], ADD_TO); + BACKWARD_INPUT(i, inputs, outputs); + } + + if (weights_[i]->getWGrad()) { + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), outputShape_[i]); + inputs.addArg(*getInputValue(i), inputShape_[i]); + inputs.addArg(*weightMultiplier_[i], multiplierShape_[i]); + // weight_multiplier + outputs.addArg(*weights_[i]->getWGrad(), filterShape_[i], ADD_TO); + BACKWARD_FILTER(i, inputs, outputs); + + /* Increasing the number of gradient */ + weights_[i]->getParameterPtr()->incUpdate(callback); + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/DepthwiseConvLayer.h b/paddle/gserver/layers/DepthwiseConvLayer.h new file mode 100644 index 0000000000..61dd87c12a --- /dev/null +++ b/paddle/gserver/layers/DepthwiseConvLayer.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "ExpandConvBaseLayer.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +/** + * @brief A subclass of convolution layer. + * This layer expands input and use matrix multiplication to + * calculate convolution operation. + * + * The config file api is img_conv_layer. 
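+ * The corresponding Python wrapper is img_depthwise_conv_layer.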
+ */ + +class DepthwiseConvLayer : public ExpandConvBaseLayer { +public: + explicit DepthwiseConvLayer(const LayerConfig& config) + : ExpandConvBaseLayer(config) {} + + ~DepthwiseConvLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback) override; + +protected: + std::vector inputShape_; + std::vector filterShape_; + std::vector outputShape_; + std::vector multiplierShape_; + std::vector weightMultiplier_; +}; + +} // namespace paddle From efae51ce240e83daff7d2042e14f7710286e9827 Mon Sep 17 00:00:00 2001 From: xzl Date: Fri, 7 Jul 2017 21:36:02 +0800 Subject: [PATCH 005/100] add the mobilenet gpu acceleration, cpu is in the process --- paddle/function/DepthwiseConvOp.cpp | 19 +- paddle/function/DepthwiseConvOp.h | 7 +- paddle/function/DepthwiseConvOpGpu.cu | 201 +++++++++++-------- paddle/gserver/layers/ConvBaseLayer.cpp | 3 +- paddle/gserver/layers/DepthwiseConvLayer.cpp | 2 + 5 files changed, 130 insertions(+), 102 deletions(-) diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp index ad332d2931..d4272c72f2 100644 --- a/paddle/function/DepthwiseConvOp.cpp +++ b/paddle/function/DepthwiseConvOp.cpp @@ -18,11 +18,6 @@ limitations under the License. */ namespace paddle { -/* - * imData = [input_channels, input_height, input_width] - * colData = [input_channels, filter_height, filter_width, - * output_height, output_width] - */ template class DepthwiseConvFunctor { public: @@ -33,6 +28,8 @@ public: int outputChannels, int outputHeight, int outputWidth, + int inputHeight, + int inputWidth, int filterHeight, int filterWidth, int strideH, @@ -40,7 +37,7 @@ public: int paddingH, int paddingW, T* outputData) { - // NO_IMPLEMENTATION + // TODO(zhaolong) : cpu implementation of depthwise convolution } }; @@ -118,8 +115,8 @@ public: size_t batchSize = input[0]; // size_t inputChannels = input[1]; - // size_t inputHeight = input[2]; - // size_t inputWidth = input[3]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; size_t filterHeight = getFilterHeight(filter); size_t filterWidth = getFilterWidth(filter); size_t outputChannels = output[1]; @@ -139,6 +136,8 @@ public: outputChannels, outputHeight, outputWidth, + inputHeight, + inputWidth, filterHeight, filterWidth, strideH(), @@ -233,8 +232,8 @@ public: } void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); + // CHECK_EQ(numInputs_, inputs.size()); + // CHECK_EQ(numOutputs_, outputs.size()); check(inputs, outputs); const TensorShape& output = inputs[0].shape(); const TensorShape& input = inputs[1].shape(); diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/function/DepthwiseConvOp.h index 8af1db974d..44290682de 100644 --- a/paddle/function/DepthwiseConvOp.h +++ b/paddle/function/DepthwiseConvOp.h @@ -18,11 +18,6 @@ limitations under the License. 
*/ namespace paddle { -/* - * imData = [input_channels, input_height, input_width] - * colData = [input_channels, filter_height, filter_width, - * output_height, output_width] - */ template class DepthwiseConvFunctor { public: @@ -33,6 +28,8 @@ public: int outputChannels, int outputHeight, int outputWidth, + int inputHeight, + int intputWidth, int filterHeight, int filterWidth, int strideH, diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu index 1b2d5d99ed..08fe9221ac 100644 --- a/paddle/function/DepthwiseConvOpGpu.cu +++ b/paddle/function/DepthwiseConvOpGpu.cu @@ -14,73 +14,95 @@ limitations under the License. */ #include "ConvOp.h" #include "DepthwiseConvOp.h" +#include "GemmFunctor.h" +#include "paddle/math/MemoryHandle.h" namespace paddle { template -__global__ void ConvolutionDepthwiseWeightForward(const int nthreads, - const T* const bottom_data, const T* const weight_data, - const int num, const int channels, const int top_height, - const int top_width, const int bottom_height, const int bottom_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, const int pad_h, const int pad_w, - const int dilation_h, const int dilation_w, T* const top_data) { +__global__ +void ConvolutionDepthwiseForward(const int nthreads, + const T* const inputData, const T* const filterData, + const int batchSize, const int outputChannels, const int outputHeight, + const int outputWidth, const int inputHeight, const int inputWidth, + const int filterHeight, const int filterWidth, const int strideH, + const int strideW, const int paddingH, const int paddingW, + T* const outputData) { int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(index < nthreads) { - const int n = index / channels / top_height / top_width; - const int c = (index / top_height / top_width) % channels; - const int h = (index / top_width) % top_height; - const int w = index % top_width; - const T* weight = weight_data + c * kernel_h * kernel_w; + const int n = index / outputChannels / outputHeight / outputWidth; + const int c = (index / outputHeight / outputWidth) % outputChannels; + const int h = (index / outputWidth) % outputHeight; + const int w = index % outputWidth; + const T* weight = filterData + c * filterHeight * filterWidth; T value = 0; - for (int kh = 0; kh < kernel_h; ++kh) { - for (int kw = 0; kw < kernel_w; ++kw) { - const int h_in = -pad_h + h * stride_h + kh * dilation_h; - const int w_in = -pad_w + w * stride_w + kw * dilation_w; - if ((h_in >= 0) && (h_in < bottom_height) - && (w_in >= 0) && (w_in < bottom_width)) { - const int offset = ((n * channels + c) * bottom_height + h_in) - * bottom_width + w_in; - value += (*weight) * bottom_data[offset]; - } - ++weight; - } - } - top_data[index] = value; + const int h_in_start = -paddingH + h * strideH; + const int w_in_start = -paddingW + w * strideW; + const int h_in_end = -paddingH + h * strideH + filterHeight - 1; + const int w_in_end = -paddingW + w * strideW + filterWidth - 1; + if ((h_in_start >= 0) && (h_in_end < inputHeight) + &&(w_in_start >= 0) && (w_in_end < inputWidth)) { + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h * strideH + kh; + const int w_in = -paddingW + w * strideW + kw; + const int offset = ((n * outputChannels + c) * inputHeight + h_in) + * inputWidth + w_in; + value += (*weight) * inputData[offset]; + ++weight; + } + } + }else{ + for (int kh = 0; kh < filterHeight; ++kh) { 
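+          // The window crosses the input border, so every tap is bounds-checked;
+          // the branch above skips these checks when the whole window fits inside.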
+ for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h * strideH + kh; + const int w_in = -paddingW + w * strideW + kw; + if ((h_in >= 0) && (h_in < inputHeight) + && (w_in >= 0) && (w_in < inputWidth)) { + const int offset = ((n * outputChannels + c) * inputHeight + h_in) + * inputWidth + w_in; + value += (*weight) * inputData[offset]; + } + ++weight; + } + } + } + outputData[index] = value; } } template -__global__ void ConvolutionDepthwiseBottomBackward(const int nthreads, +__global__ +void ConvolutionDepthwiseInputBackward(const int nthreads, const T* const top_diff, const T* const weight_data, - const int num, const int channels, const int top_height, - const int top_width, const int bottom_height, const int bottom_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, const int pad_h, const int pad_w, - const int dilation_h, const int dilation_w, T* const bottom_diff) { + const int num, const int outputChannels, const int outputHeight, + const int outputWidth, const int inputHeight, const int inputWidth, + const int filterHeight, const int filterWidth, const int strideH, + const int strideW, const int paddingH, const int paddingW, + T* const bottom_diff) { int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(index < nthreads) { - const int n = index / channels / bottom_height / bottom_width; - const int c = (index / bottom_height / bottom_width) % channels; - const int h = (index / bottom_width) % bottom_height; - const int w = index % bottom_width; - const T* weight = weight_data + c * kernel_h * kernel_w; + const int n = index / outputChannels / inputHeight / inputWidth; + const int c = (index / inputHeight / inputWidth) % outputChannels; + const int h = (index / inputWidth) % inputHeight; + const int w = index % inputWidth; + const T* weight = weight_data + c * filterHeight * filterWidth; T value = 0; - for (int kh = 0; kh < kernel_h; ++kh) { - for (int kw = 0; kw < kernel_w; ++kw) { - const int h_out_s = h + pad_h - kh * dilation_h; - const int w_out_s = w + pad_w - kw * dilation_w; - if (((h_out_s % stride_h) == 0) && ((w_out_s % stride_w) == 0)) { - const int h_out = h_out_s / stride_h; - const int w_out = w_out_s / stride_w; - //it affect the effectives - if ((h_out >= 0) && (h_out < top_height) - && (w_out >= 0) && (w_out < top_width)) { - const int offset = ((n * channels + c) * top_height + h_out) - * top_width + w_out; + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_out_s = h + paddingH - kh; + const int w_out_s = w + paddingW - kw; + if (((h_out_s % strideH) == 0) && ((w_out_s % strideW) == 0)) { + const int h_out = h_out_s / strideH; + const int w_out = w_out_s / strideW; + // TODO(zhaolong) : the 'if' affect the effectiveness, it needs to optimize + if ((h_out >= 0) && (h_out < outputHeight) + && (w_out >= 0) && (w_out < outputWidth)) { + const int offset = ((n * outputChannels + c) * outputHeight + h_out) + * outputWidth + w_out; value += (*weight) * top_diff[offset]; } } @@ -92,32 +114,33 @@ __global__ void ConvolutionDepthwiseBottomBackward(const int nthreads, } template -__global__ void ConvolutionDepthwiseWeightBackward(const int num_i, const int nthreads, - const T* const top_diff, const T* const bottom_data, - const int num, const int channels, const int top_height, - const int top_width, const int bottom_height, const int bottom_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, 
const int pad_h, const int pad_w, - const int dilation_h, const int dilation_w, T* const buffer_data) { +__global__ +void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, + const T* const top_diff, const T* const inputData, + const int num, const int outputChannels, const int outputHeight, + const int outputWidth, const int inputHeight, const int inputWidth, + const int filterHeight, const int filterWidth, const int strideH, + const int strideW, const int paddingH, const int paddingW, + T* const buffer_data) { int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < nthreads) { - const int h = (index / top_width) % top_height; - const int w = index % top_width; - const int kh = (index / kernel_w / top_height / top_width) - % kernel_h; - const int kw = (index / top_height / top_width) % kernel_w; - const int h_in = -pad_h + h * stride_h + kh * dilation_h; - const int w_in = -pad_w + w * stride_w + kw * dilation_w; - if ((h_in >= 0) && (h_in < bottom_height) - && (w_in >= 0) && (w_in < bottom_width)) { - const int c = index / kernel_h / kernel_w / top_height / top_width; + const int h = (index / outputWidth) % outputHeight; + const int w = index % outputWidth; + const int kh = (index / filterWidth / outputHeight / outputWidth) + % filterHeight; + const int kw = (index / outputHeight / outputWidth) % filterWidth; + const int h_in = -paddingH + h * strideH + kh; + const int w_in = -paddingW + w * strideW + kw; + if ((h_in >= 0) && (h_in < inputHeight) + && (w_in >= 0) && (w_in < inputWidth)) { + const int c = index / filterHeight / filterWidth / outputHeight / outputWidth; const int n = num_i; - const int top_offset = ((n * channels + c) * top_height + h) - * top_width + w; - const int bottom_offset = ((n * channels + c) * bottom_height + h_in) - * bottom_width + w_in; - buffer_data[index] = top_diff[top_offset] * bottom_data[bottom_offset]; + const int top_offset = ((n * outputChannels + c) * outputHeight + h) + * outputWidth + w; + const int bottom_offset = ((n * outputChannels + c) * inputHeight + h_in) + * inputWidth + w_in; + buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset]; } else { buffer_data[index] = 0; } @@ -134,6 +157,8 @@ public: int outputChannels, int outputHeight, int outputWidth, + int inputHeight, + int inputWidth, int filterHeight, int filterWidth, int strideH, @@ -148,7 +173,7 @@ public: dim3 threads(1024, 1); dim3 grid(blockX, blockY); - ConvolutionDepthwiseWeightForward + ConvolutionDepthwiseForward <<< grid, threads, 0, STREAM_DEFAULT >>>( outputSize, inputData, @@ -157,6 +182,8 @@ public: outputChannels, outputHeight, outputWidth, + inputHeight, + inputWidth, filterHeight, filterWidth, strideH, @@ -193,7 +220,7 @@ public: dim3 threads(1024, 1); dim3 grid(blockX, blockY); - ConvolutionDepthwiseBottomBackward + ConvolutionDepthwiseInputBackward // NOLINT_NEXT_LINE(whitespace/operators) <<< grid, threads, 0, STREAM_DEFAULT >>>( inputSize, @@ -244,10 +271,10 @@ public: dim3 threads(1024, 1); dim3 grid(blockX, blockY); - ConvolutionDepthwiseWeightBackward + ConvolutionDepthwiseFilterBackward <<< grid, threads, 0, STREAM_DEFAULT >>>( - i, - size, + num_i, + colDataSize, outputGrad, inputData, batchSize, @@ -264,8 +291,8 @@ public: paddingW, colData ); - GemmFunctor gemm; - int M = size / outputHeight / outputWidth; + GemmFunctor gemm; + int M = colDataSize / outputHeight / outputWidth; int N = 1; int K = outputHeight * outputWidth; gemm(CblasNoTrans, @@ -273,23 +300,25 @@ public: M, N, K, - 1.0f, + 
(T)1.0, colData, K, multiplierData, N, - 1.0f, + (T)1.0, filterGrad, N); //gemv } }; -template class DepthwiseConvGradInputFunctor; -template class DepthwiseConvGradInputFunctor; -template class DepthwiseConvFunctor; -template class DepthwiseConvFunctor; -template class DepthwiseConvGradFilterFunctor; -template class DepthwiseConvGradFilterFunctor; +#ifdef PADDLE_TYPE_DOUBLE +using real=double; +#else +using real=float; +#endif +template class DepthwiseConvGradInputFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvGradFilterFunctor; } // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp index e161d89c38..765c627c30 100644 --- a/paddle/gserver/layers/ConvBaseLayer.cpp +++ b/paddle/gserver/layers/ConvBaseLayer.cpp @@ -21,7 +21,8 @@ bool ConvBaseLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { /* Initialize the basic parent class */ Layer::init(layerMap, parameterMap); - isDeconv_ = (config_.type() == "exconv" || config_.type() == "cudnn_conv") + isDeconv_ = (config_.type() == "exconv" || config_.type() == "cudnn_conv" || + config_.type() == "depthwise_conv") ? false : true; diff --git a/paddle/gserver/layers/DepthwiseConvLayer.cpp b/paddle/gserver/layers/DepthwiseConvLayer.cpp index 9df8a9df7c..f07100d949 100644 --- a/paddle/gserver/layers/DepthwiseConvLayer.cpp +++ b/paddle/gserver/layers/DepthwiseConvLayer.cpp @@ -15,6 +15,7 @@ limitations under the License. */ #include "DepthwiseConvLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" +#include namespace paddle { @@ -79,6 +80,7 @@ void DepthwiseConvLayer::forward(PassType passType) { Layer::forward(passType); size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + // std::cout << "outputSize" << getOutputSize() < Date: Fri, 7 Jul 2017 21:37:12 +0800 Subject: [PATCH 006/100] add mobilenet gpu grad test, the test is ok --- paddle/gserver/tests/test_LayerGrad.cpp | 46 +++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 297756025b..f3b5813a28 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -347,6 +347,52 @@ TEST(Layer, CosSimVecMatLayer) { } } +void testDepthwiseConvLayer(const string& type, bool useGpu) { + TestConfig config; + config.biasSize = 16; + config.layerConfig.set_type(type); + config.layerConfig.set_num_filters(16); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 96}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(3); + conv->set_channels(16); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(16); + conv->set_filter_channels(conv->channels() / conv->groups()); + conv->set_img_size(16); + conv->set_img_size_y(8); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + conv->set_output_y(outputSize(conv->img_size_y(), + conv->filter_size_y(), + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true)); + config.layerConfig.set_size(conv->output_x() * conv->output_y() * + config.layerConfig.num_filters()); + + testLayerGrad(config, "depthwise_conv", 100, false, 
useGpu); + // Use small batch_size and useWeight=true to test biasGrad + testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02); +} + +TEST(Layer, depthwiseConvLayer) { +#ifndef PADDLE_ONLY_CPU + testDepthwiseConvLayer("depthwise_conv", /* useGpu= */ true); +#endif +} + void testConvLayer(const string& type, bool trans, bool useGpu) { TestConfig config; config.biasSize = 16; From 064dc888eff95ea2de08684796f56944ad7055d7 Mon Sep 17 00:00:00 2001 From: xzl Date: Mon, 10 Jul 2017 16:59:45 +0800 Subject: [PATCH 007/100] add the comments for .h file and code tiny modify --- paddle/function/DepthwiseConvOp.cpp | 73 +++++------ paddle/function/DepthwiseConvOp.h | 84 +++++++++++-- paddle/function/DepthwiseConvOpGpu.cu | 125 ++++++++++--------- paddle/gserver/layers/DepthwiseConvLayer.cpp | 9 +- paddle/gserver/layers/DepthwiseConvLayer.h | 6 +- 5 files changed, 180 insertions(+), 117 deletions(-) diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp index d4272c72f2..8dcd32b067 100644 --- a/paddle/function/DepthwiseConvOp.cpp +++ b/paddle/function/DepthwiseConvOp.cpp @@ -13,16 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "DepthwiseConvOp.h" +#include "ConvOp.h" #include "GemmFunctor.h" -#include "paddle/math/MemoryHandle.h" +//#include "paddle/math/MemoryHandle.h" namespace paddle { template class DepthwiseConvFunctor { public: - void operator()(int outputSize, - const T* inputData, + void operator()(const T* inputData, const T* filterData, int batchSize, int outputChannels, @@ -44,13 +44,13 @@ public: template class DepthwiseConvGradInputFunctor { public: - void operator()(int inputSize, - const T* outputGrad, + void operator()(const T* outputGrad, const T* filterData, int batchSize, int outputChannels, int outputHeight, int outputWidth, + int inputChannels, int inputHeight, int inputWidth, int filterHeight, @@ -65,14 +65,13 @@ public: template class DepthwiseConvGradFilterFunctor { public: - void operator()(int num_i, - int colDataSize, - const T* outputGrad, + void operator()(const T* outputGrad, const T* inputData, int batchSize, int outputChannels, int outputHeight, int outputWidth, + int inputChannels, int inputHeight, int inputWidth, int filterHeight, @@ -87,7 +86,7 @@ public: }; /* - * \brief Forward calculation of convolution. + * \brief Forward calculation of depthwise convolution. */ template class DepthwiseConvFunction : public ConvFunctionBase { @@ -126,11 +125,9 @@ public: real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); real* outputData = outputs[0].data(); - size_t outputSize = batchSize * outputChannels * outputHeight * outputWidth; DepthwiseConvFunctor depthwiseConv; - depthwiseConv(outputSize, - inputData, + depthwiseConv(inputData, filterData, batchSize, outputChannels, @@ -149,7 +146,7 @@ public: }; /* - * \brief Backward input calculation of convolution. + * \brief Backward input calculation of depthwise convolution. 
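+ * Each element of the input gradient accumulates the filter-weighted output
+ * gradients of all output positions whose receptive field covers that element.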
*/ template class DepthwiseConvGradInputFunction : public ConvFunctionBase { @@ -191,16 +188,14 @@ public: real* filterData = inputs[1].data(); real* inputGrad = outputs[0].data(); - size_t inputSize = batchSize * inputChannels * inputHeight * inputWidth; - DepthwiseConvGradInputFunctor depthwiseConvGradInput; - depthwiseConvGradInput(inputSize, - outputGrad, + depthwiseConvGradInput(outputGrad, filterData, batchSize, outputChannels, outputHeight, outputWidth, + inputChannels, inputHeight, inputWidth, filterHeight, @@ -214,7 +209,7 @@ public: }; /* - * \brief Backward filter calculation of convolution. + * \brief Backward filter calculation of depthwise convolution. */ template class DepthwiseConvGradFilterFunction : public ConvFunctionBase { @@ -255,35 +250,31 @@ public: real* multiplierData = inputs[2].data(); real* filterGrad = outputs[0].data(); - size_t size = + int size = inputChannels * filterHeight * filterWidth * outputHeight * outputWidth; - resizeBuffer(size); real* colData = reinterpret_cast(memory_->getBuf()); DepthwiseConvGradFilterFunctor depthwiseConvGradFilter; - for (size_t i = 0; i < batchSize; i++) { - depthwiseConvGradFilter(i, - size, - outputGrad, - inputData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputHeight, - inputWidth, - filterHeight, - filterWidth, - strideH(), - strideW(), - paddingH(), - paddingW(), - colData, - multiplierData, - filterGrad); - } + depthwiseConvGradFilter(outputGrad, + inputData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideH(), + strideW(), + paddingH(), + paddingW(), + colData, + multiplierData, + filterGrad); } }; diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/function/DepthwiseConvOp.h index 44290682de..da180b29b0 100644 --- a/paddle/function/DepthwiseConvOp.h +++ b/paddle/function/DepthwiseConvOp.h @@ -14,15 +14,36 @@ limitations under the License. */ #pragma once -#include "ConvOp.h" +#include "TensorType.h" namespace paddle { +/** + *\brief Depthwise convolution forward. The outputData + * of depthwise convolution is same with ExpandConvLayer + * when groups equals inputChannels in ExpandConvLayer. + * + * \param[in] inputData input data. + * \param[in] filterData the Paramters of the depthwise conv layer.. + * \param[in] batchSize batch size of input data. + * \param[in] outputChannels channels of outputData. + * \param[in] outputHeight height of outputData. + * \param[in] outputWidth width of outputData. + * \param[in] inputHeight height of inputData. + * \param[in] inputWidth width of inputData.. + * \param[in] filterHeight height of filter. + * \param[in] filterWidth widht of filter. + * \param[in] strideH stride size in height direction. + * \param[in] strideW stride size in width direction. + * \param[in] paddingH padding size in height direction. + * \param[in] paddingW padding size in width direction. + * \param[out] outputData outputData. + * + */ template class DepthwiseConvFunctor { public: - void operator()(int outputSize, - const T* inputData, + void operator()(const T* inputData, const T* filterData, int batchSize, int outputChannels, @@ -39,16 +60,38 @@ public: T* outputData); }; +/** + *\brief Functor tot compute the depthwise convolution backprop w.r.t input. + * + * + * \param[in] outputGradData the grad data of output. + * \param[in] filterData the Paramters of the depthwise conv layer.. + * \param[in] batchSize batch size of input data. 
+ * \param[in] outputChannels channels of outputData. + * \param[in] outputHeight height of outputData. + * \param[in] outputWidth width of outputData. + * \param[in] inputChannels channels of input data. + * \param[in] inputHeight height of inputData. + * \param[in] inputWidth width of inputData.. + * \param[in] filterHeight height of filter. + * \param[in] filterWidth widht of filter. + * \param[in] strideH stride size in height direction. + * \param[in] strideW stride size in width direction. + * \param[in] paddingH padding size in height direction. + * \param[in] paddingW padding size in width direction. + * \param[out] inputGrad the grad data of input. + * + */ template class DepthwiseConvGradInputFunctor { public: - void operator()(int inputSize, - const T* outputGrad, + void operator()(const T* outputGrad, const T* filterData, int batchSize, int outputChannels, int outputHeight, int outputWidth, + int inputChannels, int inputHeight, int inputWidth, int filterHeight, @@ -60,17 +103,42 @@ public: T* inputGrad); }; +/** + *\brief Functor tot compute the depthwise convolution backprop w.r.t filter. + * + * \param[in] outputGradData the grad data of output. + * \param[in] inputData inputData. + * \param[in] batchSize batch size of input data. + * \param[in] outputChannels channels of outputData. + * \param[in] outputHeight height of outputData. + * \param[in] outputWidth width of outputData. + * \param[in] inputChannels channels of input data. + * \param[in] inputHeight height of inputData. + * \param[in] inputWidth width of inputData.. + * \param[in] filterHeight height of filter. + * \param[in] filterWidth widht of filter. + * \param[in] strideH stride size in height direction. + * \param[in] strideW stride size in width direction. + * \param[in] paddingH padding size in height direction. + * \param[in] paddingW padding size in width direction. + * \param[in] colData Auxiliary data when calculating filterGrad. + * size: + *inputChannels*filterHeight*filterWidth*outputHeight*outputWidth \param[in] + *multiplierData Auxiliary data when calculating filterGrad. size: + *outputHeight * outputWidth. \param[out] + *filterGrad the grad data of filter. + * + */ template class DepthwiseConvGradFilterFunctor { public: - void operator()(int num_i, - int colDataSize, - const T* outputGrad, + void operator()(const T* outputGrad, const T* inputData, int batchSize, int outputChannels, int outputHeight, int outputWidth, + int inputChannels, int inputHeight, int inputWidth, int filterHeight, diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu index 08fe9221ac..df9be80b3f 100644 --- a/paddle/function/DepthwiseConvOpGpu.cu +++ b/paddle/function/DepthwiseConvOpGpu.cu @@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "ConvOp.h" #include "DepthwiseConvOp.h" #include "GemmFunctor.h" -#include "paddle/math/MemoryHandle.h" namespace paddle { +// CUDA kernel to compute the depthwise convolution forward pass template __global__ void ConvolutionDepthwiseForward(const int nthreads, @@ -48,7 +47,7 @@ void ConvolutionDepthwiseForward(const int nthreads, for (int kw = 0; kw < filterWidth; ++kw) { const int h_in = -paddingH + h * strideH + kh; const int w_in = -paddingW + w * strideW + kw; - const int offset = ((n * outputChannels + c) * inputHeight + h_in) + const int offset = ((n * outputChannels + c) * inputHeight + h_in) * inputWidth + w_in; value += (*weight) * inputData[offset]; ++weight; @@ -73,6 +72,7 @@ void ConvolutionDepthwiseForward(const int nthreads, } } +// CUDA kernel to compute the depthwise convolution backprop w.r.t input. template __global__ void ConvolutionDepthwiseInputBackward(const int nthreads, @@ -113,6 +113,7 @@ void ConvolutionDepthwiseInputBackward(const int nthreads, } } +// CUDA kernel to compute the depthwise convolution backprop w.r.t filter. template __global__ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, @@ -150,15 +151,14 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, template class DepthwiseConvFunctor{ public: - void operator()(int outputSize, - const T* inputData, + void operator()(const T* inputData, const T* filterData, int batchSize, int outputChannels, int outputHeight, int outputWidth, - int inputHeight, - int inputWidth, + int inputHeight, + int inputWidth, int filterHeight, int filterWidth, int strideH, @@ -167,12 +167,14 @@ public: int paddingW, T* outputData){ + int outputSize = batchSize * outputChannels * outputHeight * outputWidth; + size_t blocks = (outputSize + 1024 -1) / 1024; size_t blockX = 512; size_t blockY = (blocks+512-1)/512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); - + ConvolutionDepthwiseForward <<< grid, threads, 0, STREAM_DEFAULT >>>( outputSize, @@ -182,8 +184,8 @@ public: outputChannels, outputHeight, outputWidth, - inputHeight, - inputWidth, + inputHeight, + inputWidth, filterHeight, filterWidth, strideH, @@ -197,13 +199,13 @@ public: template class DepthwiseConvGradInputFunctor{ public: - void operator()(int inputSize, - const T* outputGrad, + void operator()(const T* outputGrad, const T* filterData, int batchSize, int outputChannels, int outputHeight, int outputWidth, + int inputChannels, int inputHeight, int inputWidth, int filterHeight, @@ -212,7 +214,9 @@ public: int strideW, int paddingH, int paddingW, - T* inputGrad){ + T* inputGrad){ + + int inputSize = batchSize * inputChannels * inputHeight * inputWidth; size_t blocks = (inputSize + 1024 -1) / 1024; size_t blockX = 512; @@ -220,6 +224,7 @@ public: dim3 threads(1024, 1); dim3 grid(blockX, blockY); + ConvolutionDepthwiseInputBackward // NOLINT_NEXT_LINE(whitespace/operators) <<< grid, threads, 0, STREAM_DEFAULT >>>( @@ -245,14 +250,13 @@ public: template class DepthwiseConvGradFilterFunctor { public: - void operator()(int num_i, - int colDataSize, - const T* outputGrad, + void operator()(const T* outputGrad, const T* inputData, int batchSize, int outputChannels, int outputHeight, int outputWidth, + int inputChannels, int inputHeight, int inputWidth, int filterHeight, @@ -265,60 +269,65 @@ public: T* multiplierData, T* filterGrad){ + int colDataSize = inputChannels * filterHeight * filterWidth * outputHeight * outputWidth; + size_t blocks = (colDataSize + 1024 -1) / 1024; size_t blockX = 512; size_t 
blockY = (blocks+512-1)/512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); - ConvolutionDepthwiseFilterBackward - <<< grid, threads, 0, STREAM_DEFAULT >>>( - num_i, - colDataSize, - outputGrad, - inputData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputHeight, - inputWidth, - filterHeight, - filterWidth, - strideH, - strideW, - paddingH, - paddingW, - colData - ); - GemmFunctor gemm; - int M = colDataSize / outputHeight / outputWidth; - int N = 1; - int K = outputHeight * outputWidth; - gemm(CblasNoTrans, - CblasNoTrans, - M, - N, - K, - (T)1.0, - colData, - K, - multiplierData, - N, - (T)1.0, - filterGrad, - N); + for(int i = 0; i < batchSize; i++) { + ConvolutionDepthwiseFilterBackward + <<< grid, threads, 0, STREAM_DEFAULT >>>( + i, + colDataSize, + outputGrad, + inputData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + colData + ); + GemmFunctor gemm; + int M = colDataSize / outputHeight / outputWidth; + int N = 1; + int K = outputHeight * outputWidth; + gemm(CblasNoTrans, + CblasNoTrans, + M, + N, + K, + (T)1.0, + colData, + K, + multiplierData, + N, + (T)1.0, + filterGrad, + N); + } //gemv } }; #ifdef PADDLE_TYPE_DOUBLE -using real=double; +template class DepthwiseConvGradInputFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvGradFilterFunctor; #else -using real=float; +template class DepthwiseConvGradInputFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvGradFilterFunctor; #endif -template class DepthwiseConvGradInputFunctor; -template class DepthwiseConvFunctor; -template class DepthwiseConvGradFilterFunctor; } // namespace paddle diff --git a/paddle/gserver/layers/DepthwiseConvLayer.cpp b/paddle/gserver/layers/DepthwiseConvLayer.cpp index f07100d949..8da3a52c24 100644 --- a/paddle/gserver/layers/DepthwiseConvLayer.cpp +++ b/paddle/gserver/layers/DepthwiseConvLayer.cpp @@ -15,14 +15,9 @@ limitations under the License. */ #include "DepthwiseConvLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" -#include namespace paddle { -/* - * The calculation of the exconvt(convolution transpose (deconv) operation) - * is a swap of forward and backward of the calculation of exconv. 
- * */ REGISTER_LAYER(depthwise_conv, DepthwiseConvLayer); bool DepthwiseConvLayer::init(const LayerMap &layerMap, @@ -76,11 +71,12 @@ bool DepthwiseConvLayer::init(const LayerMap &layerMap, #define BACKWARD_FILTER(i, inputs, outputs) \ backward_[2 * i + 1]->calc(inputs, outputs) +// compute the depthwise convolution forward pass void DepthwiseConvLayer::forward(PassType passType) { Layer::forward(passType); size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - // std::cout << "outputSize" << getOutputSize() < Date: Mon, 10 Jul 2017 23:11:05 +0800 Subject: [PATCH 008/100] use the expandconvlayer forward and backward, add the explain for class --- paddle/function/DepthwiseConvOp.cpp | 3 - paddle/function/DepthwiseConvOp.h | 4 +- paddle/function/DepthwiseConvOpGpu.cu | 22 +--- paddle/gserver/layers/DepthwiseConvLayer.cpp | 104 ------------------- paddle/gserver/layers/DepthwiseConvLayer.h | 16 +-- 5 files changed, 9 insertions(+), 140 deletions(-) diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp index 8dcd32b067..358135e9a1 100644 --- a/paddle/function/DepthwiseConvOp.cpp +++ b/paddle/function/DepthwiseConvOp.cpp @@ -81,7 +81,6 @@ public: int paddingH, int paddingW, T* colData, - T* multiplierData, T* filterGrad) {} }; @@ -247,7 +246,6 @@ public: real* outputGrad = inputs[0].data(); real* inputData = inputs[1].data(); - real* multiplierData = inputs[2].data(); real* filterGrad = outputs[0].data(); int size = @@ -273,7 +271,6 @@ public: paddingH(), paddingW(), colData, - multiplierData, filterGrad); } }; diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/function/DepthwiseConvOp.h index da180b29b0..5c5a70e5df 100644 --- a/paddle/function/DepthwiseConvOp.h +++ b/paddle/function/DepthwiseConvOp.h @@ -148,9 +148,7 @@ public: int paddingH, int paddingW, T* colData, - T* multiplierData, T* filterGrad); - -}; // namespace paddle +}; } // namespace paddle diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu index df9be80b3f..5fb85df489 100644 --- a/paddle/function/DepthwiseConvOpGpu.cu +++ b/paddle/function/DepthwiseConvOpGpu.cu @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "DepthwiseConvOp.h" #include "GemmFunctor.h" +#include "paddle/math/BaseMatrix.h" namespace paddle { // CUDA kernel to compute the depthwise convolution forward pass @@ -266,7 +267,6 @@ public: int paddingH, int paddingW, T* colData, - T* multiplierData, T* filterGrad){ int colDataSize = inputChannels * filterHeight * filterWidth * outputHeight * outputWidth; @@ -276,6 +276,7 @@ public: size_t blockY = (blocks+512-1)/512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); + BaseMatrix filterGradMatrix(inputChannels * filterHeight * filterWidth, 1, filterGrad, false, true); for(int i = 0; i < batchSize; i++) { ConvolutionDepthwiseFilterBackward @@ -298,25 +299,12 @@ public: paddingW, colData ); - GemmFunctor gemm; int M = colDataSize / outputHeight / outputWidth; - int N = 1; int K = outputHeight * outputWidth; - gemm(CblasNoTrans, - CblasNoTrans, - M, - N, - K, - (T)1.0, - colData, - K, - multiplierData, - N, - (T)1.0, - filterGrad, - N); + + BaseMatrix colMatrix(M, K, colData, false, true); + filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); } - //gemv } }; diff --git a/paddle/gserver/layers/DepthwiseConvLayer.cpp b/paddle/gserver/layers/DepthwiseConvLayer.cpp index 8da3a52c24..4b5f16d76b 100644 --- a/paddle/gserver/layers/DepthwiseConvLayer.cpp +++ b/paddle/gserver/layers/DepthwiseConvLayer.cpp @@ -29,18 +29,10 @@ bool DepthwiseConvLayer::init(const LayerMap &layerMap, inputShape_.resize(numInputs); filterShape_.resize(numInputs); outputShape_.resize(numInputs); - multiplierShape_.resize(numInputs); - weightMultiplier_.resize(numInputs); for (int i = 0; i < config_.inputs_size(); i++) { std::vector paddings = {(size_t)paddingY_[i], (size_t)padding_[i]}; std::vector strides = {(size_t)strideY_[i], (size_t)stride_[i]}; - Matrix::resizeOrCreate(weightMultiplier_[i], - (size_t)outputH_[i] * (size_t)outputW_[i], - (size_t)1, - false, - useGpu_); - weightMultiplier_[i]->one(); createFunction(forward_, "DepthwiseConv", FuncConfig() @@ -65,100 +57,4 @@ bool DepthwiseConvLayer::init(const LayerMap &layerMap, return true; } -// i is the index of input layers -#define BACKWARD_INPUT(i, inputs, outputs) \ - backward_[2 * i]->calc(inputs, outputs) -#define BACKWARD_FILTER(i, inputs, outputs) \ - backward_[2 * i + 1]->calc(inputs, outputs) - -// compute the depthwise convolution forward pass -void DepthwiseConvLayer::forward(PassType passType) { - Layer::forward(passType); - - size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - - resetOutput(batchSize, getOutputSize()); - - // Calculate the shape of the input, output, and filter. - for (size_t i = 0; i < inputLayers_.size(); ++i) { - inputShape_[i] = TensorShape({(size_t)batchSize, - (size_t)channels_[i], - (size_t)imgSizeH_[i], - (size_t)imgSizeW_[i]}); - multiplierShape_[i] = - TensorShape({(size_t)outputH_[i] * (size_t)outputW_[i], (size_t)1}); - filterShape_[i] = TensorShape({(size_t)groups_[i], - (size_t)numFilters_ / groups_[i], - (size_t)channels_[i] / groups_[i], - (size_t)filterSizeY_[i], - (size_t)filterSize_[i]}); - outputShape_[i] = TensorShape({(size_t)batchSize, - (size_t)numFilters_, - (size_t)outputH_[i], - (size_t)outputW_[i]}); - } - - // Calculate the output value. - for (size_t i = 0; i < inputLayers_.size(); ++i) { - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getInputValue(i), inputShape_[i]); - inputs.addArg(*weights_[i]->getW(), filterShape_[i]); - outputs.addArg( - *getOutputValue(), outputShape_[i], i == 0 ? 
ASSIGN_TO : ADD_TO); - - forward_[i]->calc(inputs, outputs); - } - - /* add the bias-vector */ - if (biases_.get()) { - if (sharedBiases_) { - addSharedBias(); - } else { - addUnsharedBias(); - } - } - - /* activation */ - forwardActivation(); -} - -// compute the depthwise convolution backprop. -void DepthwiseConvLayer::backward(const UpdateCallback &callback) { - backwardActivation(); - - MatrixPtr outGrad = getOutputGrad(); - if (biases_ && biases_->getWGrad()) { - bpropBiases(outGrad); - /* Increasing the number of gradient */ - biases_->getParameterPtr()->incUpdate(callback); - } - - // Calculate the input grad and filter grad. - for (size_t i = 0; i < inputLayers_.size(); ++i) { - if (getInputGrad(i)) { - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getOutputGrad(), outputShape_[i]); - inputs.addArg(*weights_[i]->getW(), filterShape_[i]); - outputs.addArg(*getInputGrad(i), inputShape_[i], ADD_TO); - BACKWARD_INPUT(i, inputs, outputs); - } - - if (weights_[i]->getWGrad()) { - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getOutputGrad(), outputShape_[i]); - inputs.addArg(*getInputValue(i), inputShape_[i]); - inputs.addArg(*weightMultiplier_[i], multiplierShape_[i]); - // weight_multiplier - outputs.addArg(*weights_[i]->getWGrad(), filterShape_[i], ADD_TO); - BACKWARD_FILTER(i, inputs, outputs); - - /* Increasing the number of gradient */ - weights_[i]->getParameterPtr()->incUpdate(callback); - } - } -} - } // namespace paddle diff --git a/paddle/gserver/layers/DepthwiseConvLayer.h b/paddle/gserver/layers/DepthwiseConvLayer.h index c640d13b58..ce074803ab 100644 --- a/paddle/gserver/layers/DepthwiseConvLayer.h +++ b/paddle/gserver/layers/DepthwiseConvLayer.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include -#include "ExpandConvBaseLayer.h" +#include "ExpandConvLayer.h" #include "paddle/math/Matrix.h" namespace paddle { @@ -26,25 +26,15 @@ namespace paddle { * The config file api is img_depthwise_conv_layer. 
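 * An illustrative call of that api (a sketch only; the argument values and the
 * activation below are placeholders, not taken from this patch):
 *
 *   conv = img_depthwise_conv_layer(input=prev_layer,
 *                                   filter_size=3,
 *                                   num_filters=32,
 *                                   num_channels=32,
 *                                   stride=1,
 *                                   padding=1,
 *                                   act=ReluActivation())
 *
 * Internally the groups count is set equal to the number of input channels,
 * so each input channel is convolved with its own filter.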
*/ -class DepthwiseConvLayer : public ExpandConvBaseLayer { +class DepthwiseConvLayer : public ExpandConvLayer { public: explicit DepthwiseConvLayer(const LayerConfig& config) - : ExpandConvBaseLayer(config) {} + : ExpandConvLayer(config) {} ~DepthwiseConvLayer() {} bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override; - -protected: - std::vector inputShape_; - std::vector filterShape_; - std::vector outputShape_; - std::vector multiplierShape_; - std::vector weightMultiplier_; }; } // namespace paddle From a3ce6aa8ca052941d81c0bbd8e847d6e78549d30 Mon Sep 17 00:00:00 2001 From: xzl Date: Mon, 10 Jul 2017 23:12:06 +0800 Subject: [PATCH 009/100] add depthwise conv test --- paddle/function/CMakeLists.txt | 1 + paddle/function/DepthwiseConvOpTest.cpp | 208 ++++++++++++++++++++++++ 2 files changed, 209 insertions(+) create mode 100644 paddle/function/DepthwiseConvOpTest.cpp diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 1518a8a654..8330c2be74 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -37,6 +37,7 @@ if(WITH_GPU) add_simple_unittest(MulOpTest) add_simple_unittest(CosSimOpTest) add_simple_unittest(RowConvOpTest) + add_simple_unittest(DepthwiseConvOpTest) endif() add_simple_unittest(ConvOpTest) diff --git a/paddle/function/DepthwiseConvOpTest.cpp b/paddle/function/DepthwiseConvOpTest.cpp new file mode 100644 index 0000000000..6d0cc6f75d --- /dev/null +++ b/paddle/function/DepthwiseConvOpTest.cpp @@ -0,0 +1,208 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "Function.h" +#include "FunctionTest.h" + +namespace paddle { + +enum TestType { + kForwardTest = 0, + kBackwardInputTest = 1, + kBackwardFilterTest = 2, +}; + +template +class DepthwiseConvolutionTest { +public: + DepthwiseConvolutionTest(const std::string& conv1, + const std::string& conv2, + TestType type, + std::string algo = "auto") { + for (size_t batchSize : {1, 32}) { + for (size_t inputSize : {7, 14, 54}) { + for (size_t filterSize : {1, 3, 5}) { + for (size_t inputChannels : {64, 128}) { + size_t outputChannels = inputChannels; + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + if (padding >= filterSize) break; + size_t outputSize = + (inputSize - filterSize + 2 * padding + stride) / stride; + VLOG(3) << " batchSize=" << batchSize + << " inputChannels=" << inputChannels + << " inputHeight=" << inputSize + << " inputWidth=" << inputSize + << " outputChannels=" << outputChannels + << " filterHeight=" << filterSize + << " filterWidth=" << filterSize + << " outputHeight=" << outputSize + << " outputWidth=" << outputSize << " stride=" << stride + << " padding=" << padding; + + std::vector paddings = {padding, padding}; + std::vector strides = {stride, stride}; + size_t groups = inputChannels; + Compare2Function test( + conv1, + conv2, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", groups) + .set("algo", algo)); + + TensorShape input{ + batchSize, inputChannels, inputSize, inputSize}; + TensorShape filter{inputChannels, 1, 1, filterSize, filterSize}; + TensorShape output{ + batchSize, outputChannels, outputSize, outputSize}; + + if (type == kForwardTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.run(); + } else if (type == kBackwardInputTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); + test.run(); + } else if (type == kBackwardFilterTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.run(); + } + } + } + } + } + } + } + } +}; + +// Mainly used to test cases where the height and width (input, filter) +// are not equal. 
+template +class DepthwiseConvolutionTest2 { +public: + DepthwiseConvolutionTest2(const std::string& conv1, + const std::string& conv2, + TestType type, + std::string algo = "auto") { + for (size_t batchSize : {16}) { + for (size_t inputHeight : {7, 31}) { + for (size_t inputWidth : {10, 54}) { + for (size_t filterHeight : {1, 5}) { + for (size_t filterWidth : {3, 7}) { + for (size_t inputChannels : {32}) { + size_t outputChannels = inputChannels; + size_t stride = 1; + size_t padding = 0; + size_t outputHeight = + (inputHeight - filterHeight + 2 * padding + stride) / + stride; + size_t outputWidth = + (inputWidth - filterWidth + 2 * padding + stride) / stride; + VLOG(3) << " batchSize=" << batchSize + << " inputChannels=" << inputChannels + << " inputHeight=" << inputHeight + << " inputWidth=" << inputWidth + << " outputChannels=" << outputChannels + << " filterHeight=" << filterHeight + << " filterWidth=" << filterWidth + << " outputHeight=" << outputHeight + << " outputWidth=" << outputWidth + << " stride=" << stride << " padding=" << padding; + + std::vector paddings = {padding, padding}; + std::vector strides = {stride, stride}; + size_t groups = inputChannels; + Compare2Function test( + conv1, + conv2, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", groups) + .set("algo", algo)); + + TensorShape input{ + batchSize, inputChannels, inputHeight, inputWidth}; + TensorShape filter{ + inputChannels, 1, 1, filterHeight, filterWidth}; + TensorShape output{ + batchSize, outputChannels, outputHeight, outputWidth}; + + if (type == kForwardTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.run(); + } else if (type == kBackwardInputTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); + test.run(); + } else if (type == kBackwardFilterTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.run(); + } + } + } + } + } + } + } + } +}; + +#ifndef PADDLE_ONLY_CPU +TEST(Forward, GEMM2) { + DepthwiseConvolutionTest test( + "DepthwiseConv-GPU", "DepthwiseConv-GPU", kForwardTest); + DepthwiseConvolutionTest2 test2( + "DepthwiseConv-GPU", "DepthwiseConv-GPU", kForwardTest); +} + +TEST(BackwardInput, GEMM) { + DepthwiseConvolutionTest test( + "DepthwiseConvGradInput-GPU", + "DepthwiseConvGradInput-GPU", + kBackwardInputTest); + DepthwiseConvolutionTest2 test2( + "DepthwiseConvGradInput-GPU", + "DepthwiseConvGradInput-GPU", + kBackwardInputTest); +} + +TEST(BackwardFilter, GEMM) { + DepthwiseConvolutionTest test( + "DepthwiseConvGradFilter-GPU", + "DepthwiseConvGradFilter-GPU", + kBackwardFilterTest); + DepthwiseConvolutionTest2 test2( + "DepthwiseConvGradFilter-GPU", + "DepthwiseConvGradFilter-GPU", + kBackwardFilterTest); +} +#endif + +} // namespace paddle From fd4b1136a9b193686910e4b194b482b11f2d3261 Mon Sep 17 00:00:00 2001 From: xzl Date: Wed, 12 Jul 2017 15:25:18 +0800 Subject: [PATCH 010/100] move DepthwiseConvOpTest.cpp to ConvOpTest.cpp --- paddle/function/CMakeLists.txt | 1 - paddle/function/ConvOpTest.cpp | 194 ++++++++++++++++++++++ paddle/function/DepthwiseConvOpTest.cpp | 208 ------------------------ 3 files changed, 194 insertions(+), 209 deletions(-) delete mode 100644 
paddle/function/DepthwiseConvOpTest.cpp diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 8330c2be74..1518a8a654 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -37,7 +37,6 @@ if(WITH_GPU) add_simple_unittest(MulOpTest) add_simple_unittest(CosSimOpTest) add_simple_unittest(RowConvOpTest) - add_simple_unittest(DepthwiseConvOpTest) endif() add_simple_unittest(ConvOpTest) diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp index dfa2f78461..61f0c18bed 100644 --- a/paddle/function/ConvOpTest.cpp +++ b/paddle/function/ConvOpTest.cpp @@ -177,6 +177,156 @@ public: } }; +template +class DepthwiseConvolutionTest { +public: + DepthwiseConvolutionTest(const std::string& conv1, + const std::string& conv2, + TestType type, + std::string algo = "auto") { + for (size_t batchSize : {1, 32}) { + for (size_t inputSize : {7, 14, 54}) { + for (size_t filterSize : {1, 3, 5}) { + for (size_t inputChannels : {64, 128}) { + size_t outputChannels = inputChannels; + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + if (padding >= filterSize) break; + size_t outputSize = + (inputSize - filterSize + 2 * padding + stride) / stride; + VLOG(3) << " batchSize=" << batchSize + << " inputChannels=" << inputChannels + << " inputHeight=" << inputSize + << " inputWidth=" << inputSize + << " outputChannels=" << outputChannels + << " filterHeight=" << filterSize + << " filterWidth=" << filterSize + << " outputHeight=" << outputSize + << " outputWidth=" << outputSize << " stride=" << stride + << " padding=" << padding; + + std::vector paddings = {padding, padding}; + std::vector strides = {stride, stride}; + size_t groups = inputChannels; + Compare2Function test( + conv1, + conv2, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", groups) + .set("algo", algo)); + + TensorShape input{ + batchSize, inputChannels, inputSize, inputSize}; + TensorShape filter{inputChannels, 1, 1, filterSize, filterSize}; + TensorShape output{ + batchSize, outputChannels, outputSize, outputSize}; + + if (type == kForwardTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.run(); + } else if (type == kBackwardInputTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); + test.run(); + } else if (type == kBackwardFilterTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.run(); + } + } + } + } + } + } + } + } +}; + +// Mainly used to test cases where the height and width (input, filter) +// are not equal. 
+template +class DepthwiseConvolutionTest2 { +public: + DepthwiseConvolutionTest2(const std::string& conv1, + const std::string& conv2, + TestType type, + std::string algo = "auto") { + for (size_t batchSize : {16}) { + for (size_t inputHeight : {7, 31}) { + for (size_t inputWidth : {10, 54}) { + for (size_t filterHeight : {1, 5}) { + for (size_t filterWidth : {3, 7}) { + for (size_t inputChannels : {32}) { + size_t outputChannels = inputChannels; + size_t stride = 1; + size_t padding = 0; + size_t outputHeight = + (inputHeight - filterHeight + 2 * padding + stride) / + stride; + size_t outputWidth = + (inputWidth - filterWidth + 2 * padding + stride) / stride; + VLOG(3) << " batchSize=" << batchSize + << " inputChannels=" << inputChannels + << " inputHeight=" << inputHeight + << " inputWidth=" << inputWidth + << " outputChannels=" << outputChannels + << " filterHeight=" << filterHeight + << " filterWidth=" << filterWidth + << " outputHeight=" << outputHeight + << " outputWidth=" << outputWidth + << " stride=" << stride << " padding=" << padding; + + std::vector paddings = {padding, padding}; + std::vector strides = {stride, stride}; + size_t groups = inputChannels; + Compare2Function test( + conv1, + conv2, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", groups) + .set("algo", algo)); + + TensorShape input{ + batchSize, inputChannels, inputHeight, inputWidth}; + TensorShape filter{ + inputChannels, 1, 1, filterHeight, filterWidth}; + TensorShape output{ + batchSize, outputChannels, outputHeight, outputWidth}; + + if (type == kForwardTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.run(); + } else if (type == kBackwardInputTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); + test.run(); + } else if (type == kBackwardFilterTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.run(); + } + } + } + } + } + } + } + } +}; + +// ======Start Convolution TEST====== TEST(Forward, GEMM) { ConvolutionTest test( "NaiveConv-CPU", "GemmConv-CPU", kForwardTest); @@ -206,5 +356,49 @@ TEST(BackwardFilter, GEMM) { "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest); } #endif +// ======End Convolution TEST====== + +// ======Start DepthwiseConvolution TEST====== +// TODO(zhaolong) The depthwise convolution cpu test will be added when the cpu +// version of depthwiseConv is implemented. 
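// The TODO above can be read against this minimal CPU sketch of the depthwise
// forward pass (illustrative only, not part of the patch; the parameter names
// mirror the GPU functor and the bounds handling mirrors the CUDA kernel):
template <class T>
void naiveDepthwiseConvForward(const T* inputData, const T* filterData,
                               int batchSize, int channels,
                               int outputHeight, int outputWidth,
                               int inputHeight, int inputWidth,
                               int filterHeight, int filterWidth,
                               int strideH, int strideW,
                               int paddingH, int paddingW,
                               T* outputData) {
  for (int n = 0; n < batchSize; ++n) {
    for (int c = 0; c < channels; ++c) {
      // Each channel is convolved with its own filterHeight x filterWidth filter.
      const T* filter = filterData + c * filterHeight * filterWidth;
      for (int h = 0; h < outputHeight; ++h) {
        for (int w = 0; w < outputWidth; ++w) {
          T value = 0;
          for (int kh = 0; kh < filterHeight; ++kh) {
            for (int kw = 0; kw < filterWidth; ++kw) {
              const int h_in = -paddingH + h * strideH + kh;
              const int w_in = -paddingW + w * strideW + kw;
              // Taps that fall outside the (possibly padded) input contribute 0.
              if (h_in >= 0 && h_in < inputHeight && w_in >= 0 &&
                  w_in < inputWidth) {
                value += filter[kh * filterWidth + kw] *
                         inputData[((n * channels + c) * inputHeight + h_in) *
                                       inputWidth +
                                   w_in];
              }
            }
          }
          outputData[((n * channels + c) * outputHeight + h) * outputWidth +
                     w] = value;
        }
      }
    }
  }
}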
+ +#ifndef PADDLE_ONLY_CPU +TEST(DepthwiseConvForward, GEMM) { + DepthwiseConvolutionTest test( + "GemmConv-GPU", "DepthwiseConv-GPU", kForwardTest); + DepthwiseConvolutionTest2 test2( + "GemmConv-GPU", "DepthwiseConv-GPU", kForwardTest); +} + +TEST(DepthwiseConvForward, GEMM2) { + DepthwiseConvolutionTest test( + "DepthwiseConv-GPU", "DepthwiseConv-GPU", kForwardTest); + DepthwiseConvolutionTest2 test2( + "DepthwiseConv-GPU", "DepthwiseConv-GPU", kForwardTest); +} + +TEST(DepthwiseConvBackwardInput, GEMM) { + DepthwiseConvolutionTest test( + "DepthwiseConvGradInput-GPU", + "DepthwiseConvGradInput-GPU", + kBackwardInputTest); + DepthwiseConvolutionTest2 test2( + "DepthwiseConvGradInput-GPU", + "DepthwiseConvGradInput-GPU", + kBackwardInputTest); +} + +TEST(DepthwiseConvBackwardFilter, GEMM) { + DepthwiseConvolutionTest test( + "DepthwiseConvGradFilter-GPU", + "DepthwiseConvGradFilter-GPU", + kBackwardFilterTest); + DepthwiseConvolutionTest2 test2( + "DepthwiseConvGradFilter-GPU", + "DepthwiseConvGradFilter-GPU", + kBackwardFilterTest); +} +#endif +// ======End DepthwiseConvolution TEST====== } // namespace paddle diff --git a/paddle/function/DepthwiseConvOpTest.cpp b/paddle/function/DepthwiseConvOpTest.cpp deleted file mode 100644 index 6d0cc6f75d..0000000000 --- a/paddle/function/DepthwiseConvOpTest.cpp +++ /dev/null @@ -1,208 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "Function.h" -#include "FunctionTest.h" - -namespace paddle { - -enum TestType { - kForwardTest = 0, - kBackwardInputTest = 1, - kBackwardFilterTest = 2, -}; - -template -class DepthwiseConvolutionTest { -public: - DepthwiseConvolutionTest(const std::string& conv1, - const std::string& conv2, - TestType type, - std::string algo = "auto") { - for (size_t batchSize : {1, 32}) { - for (size_t inputSize : {7, 14, 54}) { - for (size_t filterSize : {1, 3, 5}) { - for (size_t inputChannels : {64, 128}) { - size_t outputChannels = inputChannels; - for (size_t stride : {1, 2}) { - for (size_t padding : {0, 1}) { - if (padding >= filterSize) break; - size_t outputSize = - (inputSize - filterSize + 2 * padding + stride) / stride; - VLOG(3) << " batchSize=" << batchSize - << " inputChannels=" << inputChannels - << " inputHeight=" << inputSize - << " inputWidth=" << inputSize - << " outputChannels=" << outputChannels - << " filterHeight=" << filterSize - << " filterWidth=" << filterSize - << " outputHeight=" << outputSize - << " outputWidth=" << outputSize << " stride=" << stride - << " padding=" << padding; - - std::vector paddings = {padding, padding}; - std::vector strides = {stride, stride}; - size_t groups = inputChannels; - Compare2Function test( - conv1, - conv2, - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", groups) - .set("algo", algo)); - - TensorShape input{ - batchSize, inputChannels, inputSize, inputSize}; - TensorShape filter{inputChannels, 1, 1, filterSize, filterSize}; - TensorShape output{ - batchSize, outputChannels, outputSize, outputSize}; - - if (type == kForwardTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.run(); - } else if (type == kBackwardInputTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); - test.run(); - } else if (type == kBackwardFilterTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.run(); - } - } - } - } - } - } - } - } -}; - -// Mainly used to test cases where the height and width (input, filter) -// are not equal. 
-template -class DepthwiseConvolutionTest2 { -public: - DepthwiseConvolutionTest2(const std::string& conv1, - const std::string& conv2, - TestType type, - std::string algo = "auto") { - for (size_t batchSize : {16}) { - for (size_t inputHeight : {7, 31}) { - for (size_t inputWidth : {10, 54}) { - for (size_t filterHeight : {1, 5}) { - for (size_t filterWidth : {3, 7}) { - for (size_t inputChannels : {32}) { - size_t outputChannels = inputChannels; - size_t stride = 1; - size_t padding = 0; - size_t outputHeight = - (inputHeight - filterHeight + 2 * padding + stride) / - stride; - size_t outputWidth = - (inputWidth - filterWidth + 2 * padding + stride) / stride; - VLOG(3) << " batchSize=" << batchSize - << " inputChannels=" << inputChannels - << " inputHeight=" << inputHeight - << " inputWidth=" << inputWidth - << " outputChannels=" << outputChannels - << " filterHeight=" << filterHeight - << " filterWidth=" << filterWidth - << " outputHeight=" << outputHeight - << " outputWidth=" << outputWidth - << " stride=" << stride << " padding=" << padding; - - std::vector paddings = {padding, padding}; - std::vector strides = {stride, stride}; - size_t groups = inputChannels; - Compare2Function test( - conv1, - conv2, - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", groups) - .set("algo", algo)); - - TensorShape input{ - batchSize, inputChannels, inputHeight, inputWidth}; - TensorShape filter{ - inputChannels, 1, 1, filterHeight, filterWidth}; - TensorShape output{ - batchSize, outputChannels, outputHeight, outputWidth}; - - if (type == kForwardTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.run(); - } else if (type == kBackwardInputTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); - test.run(); - } else if (type == kBackwardFilterTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.run(); - } - } - } - } - } - } - } - } -}; - -#ifndef PADDLE_ONLY_CPU -TEST(Forward, GEMM2) { - DepthwiseConvolutionTest test( - "DepthwiseConv-GPU", "DepthwiseConv-GPU", kForwardTest); - DepthwiseConvolutionTest2 test2( - "DepthwiseConv-GPU", "DepthwiseConv-GPU", kForwardTest); -} - -TEST(BackwardInput, GEMM) { - DepthwiseConvolutionTest test( - "DepthwiseConvGradInput-GPU", - "DepthwiseConvGradInput-GPU", - kBackwardInputTest); - DepthwiseConvolutionTest2 test2( - "DepthwiseConvGradInput-GPU", - "DepthwiseConvGradInput-GPU", - kBackwardInputTest); -} - -TEST(BackwardFilter, GEMM) { - DepthwiseConvolutionTest test( - "DepthwiseConvGradFilter-GPU", - "DepthwiseConvGradFilter-GPU", - kBackwardFilterTest); - DepthwiseConvolutionTest2 test2( - "DepthwiseConvGradFilter-GPU", - "DepthwiseConvGradFilter-GPU", - kBackwardFilterTest); -} -#endif - -} // namespace paddle From 2bc08f8914ef45a53c163482e9af6b7a86a54d7b Mon Sep 17 00:00:00 2001 From: xzl Date: Wed, 12 Jul 2017 15:55:02 +0800 Subject: [PATCH 011/100] modify format accored with clang-format 3.8 --- python/paddle/trainer/config_parser.py | 5 +---- python/paddle/trainer_config_helpers/layers.py | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 
9610e52186..2079aaa89f 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1811,9 +1811,6 @@ class DepthwiseConvLayer(LayerBase): use_gpu = int(g_command_config_args.get("use_gpu", 0)) parallel_nn = int(g_command_config_args.get("parallel_nn", 0)) - # Automatically select cudnn_type for GPU and exconv for CPU - # if set type=conv, but still reserve the way user specify - # exconv or cudnn_conv manually. self.layer_type = "depthwise_conv" # need to specify layer in config self.config.type = self.layer_type @@ -1824,7 +1821,7 @@ class DepthwiseConvLayer(LayerBase): for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) conv_conf = self.config.inputs[input_index].conv_conf - #set the groups + #set the groups, the groups equals the input channels self.inputs[input_index].conv.groups = self.inputs[ input_index].conv.channels parse_conv(self.inputs[input_index].conv, input_layer.name, diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index c07c879191..40ac3698bb 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2337,6 +2337,7 @@ def img_depthwise_conv_layer(input, shared_biases=shared_biases, type=lt, **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( name, lt, From ccd46d1bf66c9fc639f5994cb882fcc9e06c9c27 Mon Sep 17 00:00:00 2001 From: xzl Date: Wed, 12 Jul 2017 15:56:56 +0800 Subject: [PATCH 012/100] modify format accored with clang-format 3.8 --- paddle/function/DepthwiseConvOp.cpp | 2 ++ paddle/function/DepthwiseConvOp.h | 7 ++----- paddle/function/DepthwiseConvOpGpu.cu | 1 + paddle/gserver/layers/DepthwiseConvLayer.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp index 358135e9a1..31eccda67d 100644 --- a/paddle/function/DepthwiseConvOp.cpp +++ b/paddle/function/DepthwiseConvOp.cpp @@ -60,6 +60,7 @@ public: int paddingH, int paddingW, T* inputGrad) {} + // TODO(zhaolong) : cpu implementation of depthwise convolution }; template @@ -82,6 +83,7 @@ public: int paddingW, T* colData, T* filterGrad) {} + // TODO(zhaolong) : cpu implementation of depthwise convolution }; /* diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/function/DepthwiseConvOp.h index 5c5a70e5df..356ff37c6a 100644 --- a/paddle/function/DepthwiseConvOp.h +++ b/paddle/function/DepthwiseConvOp.h @@ -122,11 +122,8 @@ public: * \param[in] paddingH padding size in height direction. * \param[in] paddingW padding size in width direction. * \param[in] colData Auxiliary data when calculating filterGrad. - * size: - *inputChannels*filterHeight*filterWidth*outputHeight*outputWidth \param[in] - *multiplierData Auxiliary data when calculating filterGrad. size: - *outputHeight * outputWidth. \param[out] - *filterGrad the grad data of filter. + * \param[in] multiplierData Auxiliary data when calculating filterGrad. + * \param[out] filterGrad the grad data of filter. * */ template diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu index 5fb85df489..737f091ab8 100644 --- a/paddle/function/DepthwiseConvOpGpu.cu +++ b/paddle/function/DepthwiseConvOpGpu.cu @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/math/BaseMatrix.h" namespace paddle { + // CUDA kernel to compute the depthwise convolution forward pass template __global__ diff --git a/paddle/gserver/layers/DepthwiseConvLayer.h b/paddle/gserver/layers/DepthwiseConvLayer.h index ce074803ab..1b154bd99d 100644 --- a/paddle/gserver/layers/DepthwiseConvLayer.h +++ b/paddle/gserver/layers/DepthwiseConvLayer.h @@ -22,7 +22,7 @@ namespace paddle { /** * @brief A subclass of convolution layer. - * This layer do the depthwise convolution calculation in mobilenet. + * This layer does the depthwise convolution calculation of mobilenet. * The config file api is img_depthwise_conv_layer. */ From 030a3db20ffdf5c93f453cea4d9cbff5dbb48419 Mon Sep 17 00:00:00 2001 From: xzl Date: Wed, 12 Jul 2017 21:09:49 +0800 Subject: [PATCH 013/100] the groups default should be None --- python/paddle/trainer_config_helpers/layers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 40ac3698bb..351bd8fea8 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2269,7 +2269,7 @@ def img_depthwise_conv_layer(input, name=None, num_channels=None, act=None, - groups=1, + groups=None, stride=1, padding=0, bias_attr=None, @@ -2286,6 +2286,8 @@ def img_depthwise_conv_layer(input, assert input.num_filters is not None num_channels = input.num_filters + groups = num_channels + if filter_size_y is None: if isinstance(filter_size, collections.Sequence): assert len(filter_size) == 2 From c43f6936c07145890deda97e9f101b8c50f89a1b Mon Sep 17 00:00:00 2001 From: xzl Date: Fri, 14 Jul 2017 11:16:36 +0800 Subject: [PATCH 014/100] modify the format and delete useless comment --- paddle/function/DepthwiseConvOp.cpp | 12 ++-- paddle/function/DepthwiseConvOp.h | 4 +- paddle/function/DepthwiseConvOpGpu.cu | 84 ++++++++++++++------------- 3 files changed, 52 insertions(+), 48 deletions(-) diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp index 31eccda67d..0ac83f5824 100644 --- a/paddle/function/DepthwiseConvOp.cpp +++ b/paddle/function/DepthwiseConvOp.cpp @@ -15,7 +15,6 @@ limitations under the License. */ #include "DepthwiseConvOp.h" #include "ConvOp.h" #include "GemmFunctor.h" -//#include "paddle/math/MemoryHandle.h" namespace paddle { @@ -28,6 +27,7 @@ public: int outputChannels, int outputHeight, int outputWidth, + int inputChannels, int inputHeight, int inputWidth, int filterHeight, @@ -114,7 +114,7 @@ public: const TensorShape& output = outputs[0].shape(); size_t batchSize = input[0]; - // size_t inputChannels = input[1]; + size_t inputChannels = input[1]; size_t inputHeight = input[2]; size_t inputWidth = input[3]; size_t filterHeight = getFilterHeight(filter); @@ -134,6 +134,7 @@ public: outputChannels, outputHeight, outputWidth, + inputChannels, inputHeight, inputWidth, filterHeight, @@ -168,8 +169,6 @@ public: CHECK_EQ(numInputs_, inputs.size()); CHECK_EQ(numOutputs_, outputs.size()); check(inputs, outputs); - // Since the implementation of Col2ImFunctor is ADD_TO, - // this function only supports ADD_TO mode. 
CHECK_EQ(outputs[0].getArgType(), ADD_TO); const TensorShape& output = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); @@ -228,12 +227,11 @@ public: } void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - // CHECK_EQ(numInputs_, inputs.size()); - // CHECK_EQ(numOutputs_, outputs.size()); + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); check(inputs, outputs); const TensorShape& output = inputs[0].shape(); const TensorShape& input = inputs[1].shape(); - // const TensorShape& multiplier = inputs[2].shape(); const TensorShape& filter = outputs[0].shape(); size_t batchSize = input[0]; diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/function/DepthwiseConvOp.h index 356ff37c6a..2b9bef4cd7 100644 --- a/paddle/function/DepthwiseConvOp.h +++ b/paddle/function/DepthwiseConvOp.h @@ -29,6 +29,7 @@ namespace paddle { * \param[in] outputChannels channels of outputData. * \param[in] outputHeight height of outputData. * \param[in] outputWidth width of outputData. + * \param[in] inputChannels channels of inputData. * \param[in] inputHeight height of inputData. * \param[in] inputWidth width of inputData.. * \param[in] filterHeight height of filter. @@ -49,8 +50,9 @@ public: int outputChannels, int outputHeight, int outputWidth, + int inputChannels, int inputHeight, - int intputWidth, + int inputWidth, int filterHeight, int filterWidth, int strideH, diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu index 737f091ab8..7740b7022d 100644 --- a/paddle/function/DepthwiseConvOpGpu.cu +++ b/paddle/function/DepthwiseConvOpGpu.cu @@ -24,7 +24,7 @@ __global__ void ConvolutionDepthwiseForward(const int nthreads, const T* const inputData, const T* const filterData, const int batchSize, const int outputChannels, const int outputHeight, - const int outputWidth, const int inputHeight, const int inputWidth, + const int outputWidth,const int inputChannels, const int inputHeight, const int inputWidth, const int filterHeight, const int filterWidth, const int strideH, const int strideW, const int paddingH, const int paddingW, T* const outputData) { @@ -39,36 +39,36 @@ void ConvolutionDepthwiseForward(const int nthreads, const int w = index % outputWidth; const T* weight = filterData + c * filterHeight * filterWidth; T value = 0; - const int h_in_start = -paddingH + h * strideH; - const int w_in_start = -paddingW + w * strideW; - const int h_in_end = -paddingH + h * strideH + filterHeight - 1; - const int w_in_end = -paddingW + w * strideW + filterWidth - 1; + const int h_in_start = -paddingH + h * strideH; + const int w_in_start = -paddingW + w * strideW; + const int h_in_end = -paddingH + h * strideH + filterHeight - 1; + const int w_in_end = -paddingW + w * strideW + filterWidth - 1; if ((h_in_start >= 0) && (h_in_end < inputHeight) &&(w_in_start >= 0) && (w_in_end < inputWidth)) { - for (int kh = 0; kh < filterHeight; ++kh) { - for (int kw = 0; kw < filterWidth; ++kw) { - const int h_in = -paddingH + h * strideH + kh; - const int w_in = -paddingW + w * strideW + kw; - const int offset = ((n * outputChannels + c) * inputHeight + h_in) + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h * strideH + kh; + const int w_in = -paddingW + w * strideW + kw; + const int offset = ((n * inputChannels + c) * inputHeight + h_in) * inputWidth + w_in; - value += (*weight) * inputData[offset]; - ++weight; - } - } - }else{ - for (int kh = 0; kh < 
filterHeight; ++kh) { - for (int kw = 0; kw < filterWidth; ++kw) { - const int h_in = -paddingH + h * strideH + kh; - const int w_in = -paddingW + w * strideW + kw; - if ((h_in >= 0) && (h_in < inputHeight) - && (w_in >= 0) && (w_in < inputWidth)) { - const int offset = ((n * outputChannels + c) * inputHeight + h_in) - * inputWidth + w_in; - value += (*weight) * inputData[offset]; - } - ++weight; + value += (*weight) * inputData[offset]; + ++weight; } } + }else{ + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h * strideH + kh; + const int w_in = -paddingW + w * strideW + kw; + if ((h_in >= 0) && (h_in < inputHeight) + && (w_in >= 0) && (w_in < inputWidth)) { + const int offset = ((n * outputChannels + c) * inputHeight + h_in) + * inputWidth + w_in; + value += (*weight) * inputData[offset]; + } + ++weight; + } + } } outputData[index] = value; } @@ -80,15 +80,15 @@ __global__ void ConvolutionDepthwiseInputBackward(const int nthreads, const T* const top_diff, const T* const weight_data, const int num, const int outputChannels, const int outputHeight, - const int outputWidth, const int inputHeight, const int inputWidth, + const int outputWidth,const int inputChannels, const int inputHeight, const int inputWidth, const int filterHeight, const int filterWidth, const int strideH, const int strideW, const int paddingH, const int paddingW, T* const bottom_diff) { int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(index < nthreads) { - const int n = index / outputChannels / inputHeight / inputWidth; - const int c = (index / inputHeight / inputWidth) % outputChannels; + const int n = index / inputChannels / inputHeight / inputWidth; + const int c = (index / inputHeight / inputWidth) % inputChannels; const int h = (index / inputWidth) % inputHeight; const int w = index % inputWidth; const T* weight = weight_data + c * filterHeight * filterWidth; @@ -100,7 +100,7 @@ void ConvolutionDepthwiseInputBackward(const int nthreads, if (((h_out_s % strideH) == 0) && ((w_out_s % strideW) == 0)) { const int h_out = h_out_s / strideH; const int w_out = w_out_s / strideW; - // TODO(zhaolong) : the 'if' affect the effectiveness, it needs to optimize + // TODO(zhaolong) : the 'if' affect the effectiveness, it needs to optimize if ((h_out >= 0) && (h_out < outputHeight) && (w_out >= 0) && (w_out < outputWidth)) { const int offset = ((n * outputChannels + c) * outputHeight + h_out) @@ -121,7 +121,7 @@ __global__ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, const T* const top_diff, const T* const inputData, const int num, const int outputChannels, const int outputHeight, - const int outputWidth, const int inputHeight, const int inputWidth, + const int outputWidth, const int inputChannels, const int inputHeight, const int inputWidth, const int filterHeight, const int filterWidth, const int strideH, const int strideW, const int paddingH, const int paddingW, T* const buffer_data) { @@ -141,7 +141,7 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, const int n = num_i; const int top_offset = ((n * outputChannels + c) * outputHeight + h) * outputWidth + w; - const int bottom_offset = ((n * outputChannels + c) * inputHeight + h_in) + const int bottom_offset = ((n * inputChannels + c) * inputHeight + h_in) * inputWidth + w_in; buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset]; } else { @@ -159,6 +159,7 @@ public: int outputChannels, int 
outputHeight, int outputWidth, + int inputChannels, int inputHeight, int inputWidth, int filterHeight, @@ -186,6 +187,7 @@ public: outputChannels, outputHeight, outputWidth, + inputChannels, inputHeight, inputWidth, filterHeight, @@ -218,7 +220,7 @@ public: int paddingW, T* inputGrad){ - int inputSize = batchSize * inputChannels * inputHeight * inputWidth; + int inputSize = batchSize * inputChannels * inputHeight * inputWidth; size_t blocks = (inputSize + 1024 -1) / 1024; size_t blockX = 512; @@ -237,6 +239,7 @@ public: outputChannels, outputHeight, outputWidth, + inputChannels, inputHeight, inputWidth, filterHeight, @@ -277,11 +280,11 @@ public: size_t blockY = (blocks+512-1)/512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); - BaseMatrix filterGradMatrix(inputChannels * filterHeight * filterWidth, 1, filterGrad, false, true); + BaseMatrix filterGradMatrix(inputChannels * filterHeight * filterWidth, 1, filterGrad, false, true); for(int i = 0; i < batchSize; i++) { - ConvolutionDepthwiseFilterBackward - <<< grid, threads, 0, STREAM_DEFAULT >>>( + ConvolutionDepthwiseFilterBackward + <<< grid, threads, 0, STREAM_DEFAULT >>>( i, colDataSize, outputGrad, @@ -290,6 +293,7 @@ public: outputChannels, outputHeight, outputWidth, + inputChannels, inputHeight, inputWidth, filterHeight, @@ -299,12 +303,12 @@ public: paddingH, paddingW, colData - ); - int M = colDataSize / outputHeight / outputWidth; - int K = outputHeight * outputWidth; + ); + int M = colDataSize / outputHeight / outputWidth; + int K = outputHeight * outputWidth; BaseMatrix colMatrix(M, K, colData, false, true); - filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); + filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); } } }; From 89a4158038028c1a278ddec791e15bcff8307460 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 18 Jul 2017 16:15:36 +0800 Subject: [PATCH 015/100] enable MKLDNN library and MKL small package --- CMakeLists.txt | 7 +++ cmake/cblas.cmake | 40 +++++++++++---- cmake/configure.cmake | 6 +++ cmake/external/mkldnn.cmake | 78 +++++++++++++++++++++++++++++ paddle/math/MathFunctions.cpp | 93 ++++++++++++++++++----------------- paddle/math/MathFunctions.h | 6 +++ 6 files changed, 176 insertions(+), 54 deletions(-) create mode 100644 cmake/external/mkldnn.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index dcff6b54ca..5e664d1415 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,7 @@ include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) +option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." 
ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) @@ -94,6 +95,7 @@ include(external/glog) # download, build, install glog include(external/gtest) # download, build, install gtest include(external/protobuf) # download, build, install protobuf include(external/python) # download, build, install python +include(external/mkldnn) # download, build, install mkldnn include(external/openblas) # download, build, install openblas include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc @@ -136,6 +138,11 @@ if(WITH_GPU) endif(NOT WITH_DSO) endif(WITH_GPU) +if(WITH_MKLDNN) + message(STATUS "MKLDNN_LIBRARY: ${MKLDNN_LIBRARY}") + list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKL_LITE_LIB_IOMP}) +endif() + if(USE_NNPACK) include(external/nnpack) list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS}) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 913f711aff..ee654e64bd 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -16,22 +16,42 @@ set(CBLAS_FOUND OFF) ## Find MKL First. -set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs") -set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL") +set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs") +set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL") + +set(MKL_INCLUDE_SEARCH_PATHS + ${MKL_ROOT}/include + ${INTEL_MKL_ROOT}/include) +set(MKL_LIB_SEARCH_PATHS + ${MKL_ROOT}/lib + ${MKL_ROOT}/lib/intel64 + ${INTEL_MKL_ROOT}/lib + ${INTEL_MKL_ROOT}/lib/intel64) + +if(MKL_LITE_INC_DIR AND MKL_LITE_LIB) + set(CBLAS_FOUND ON) + set(CBLAS_PROVIDER MKL_LITE) + set(CBLAS_INC_DIR ${MKL_LITE_INC_DIR}) + set(CBLAS_LIBRARIES ${MKL_LITE_LIB}) + + add_definitions(-DPADDLE_USE_MKL_LITE) + add_definitions(-DLAPACK_FOUND) + + message(STATUS "Found cblas and lapack in MKL Lite " + "(include: ${MKL_LITE_INC_DIR}, library: ${CBLAS_LIBRARIES})") + return() +endif() find_path(MKL_INC_DIR mkl.h PATHS - ${MKL_ROOT}/include) + ${MKL_INCLUDE_SEARCH_PATHS}) find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS - ${MKL_ROOT}/include) + ${MKL_INCLUDE_SEARCH_PATHS}) find_library(MKL_CORE_LIB NAMES mkl_core PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64) + ${MKL_LIB_SEARCH_PATHS}) find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64) + ${MKL_LIB_SEARCH_PATHS}) find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64) + ${MKL_LIB_SEARCH_PATHS}) if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) set(CBLAS_FOUND ON) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 7afab5d534..8719197682 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -67,6 +67,12 @@ else() include_directories(${CUDA_TOOLKIT_INCLUDE}) endif(NOT WITH_GPU) +if(WITH_MKLDNN) + add_definitions(-DPADDLE_USE_MKLDNN) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +endif(WITH_MKLDNN) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake new file mode 100644 index 0000000000..834f5ae230 --- /dev/null +++ b/cmake/external/mkldnn.cmake @@ -0,0 +1,78 @@ +# Copyright (c) 2017 PaddlePaddle Authors. 
All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_MKLDNN}) + return() +ENDIF(NOT ${WITH_MKLDNN}) + +INCLUDE(ExternalProject) + +SET(MKLDNN_PROJECT "extern_mkldnn") +SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) +SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) +SET(MKLDNN_INCLUDE_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) + +# The following magic numbers should be updated regularly to keep latest version +SET(MKLDNN_TAG "v0.9") +SET(MKLDNN_MKL_VER "mklml_lnx_2018.0.20170425") + +IF(WIN32) + MESSAGE(WARNING "It is not supported compiling with mkldnn in windows Paddle yet." + "Force WITH_MKLDNN=OFF") + SET(WITH_MKLDNN OFF) + return() +ELSE(WIN32) + SET(MKLDNN_LIBRARY "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE) + MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path") + SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) + #SET(CMAKE_MACOSX_RPATH 1) # hold for MacOS + SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") +ENDIF(WIN32) + +INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR}) + +SET(MKLDNN_CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") +SET(MKLDNN_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + +ExternalProject_Add( + ${MKLDNN_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" + GIT_TAG "${MKLDNN_TAG}" + PREFIX ${MKLDNN_SOURCES_DIR} + PATCH_COMMAND cd /scripts && ./prepare_mkl.sh + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CMAKE_C_FLAGS} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_LIBDIR=${MKLDNN_INSTALL_DIR}/lib + CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${MKLDNN_INSTALL_DIR}/lib + -DCMAKE_BUILD_TYPE:STRING=Release +) + +SET(MKL_LITE_DIR ${MKLDNN_SOURCES_DIR}/src/${MKLDNN_PROJECT}/external/${MKLDNN_MKL_VER}) +SET(MKL_LITE_INC_DIR ${MKL_LITE_DIR}/include) +SET(MKL_LITE_LIB ${MKL_LITE_DIR}/lib/libmklml_intel.so) +SET(MKL_LITE_LIB_IOMP ${MKL_LITE_DIR}/lib/libiomp5.so) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKL_LITE_DIR}/lib") + +ADD_LIBRARY(mkldnn STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIBRARY}) +ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) + +LIST(APPEND external_project_dependencies mkldnn) diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index 7045562dd4..999b72cc15 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -202,7 +202,7 @@ double dotProduct(const int n, const double* x, const double* y) { return cblas_ddot(n, x, 1, y, 1); } -#ifdef PADDLE_USE_MKL +#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKL_LITE) template <> void vExp(const int n, const 
float* a, float* r) { @@ -243,7 +243,55 @@ template <> void vAdd(const int n, const double* a, const double* b, double* r) { vdAdd(n, a, b, r); } +#else + +DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a)); +template +void vExp(const int n, const T* a, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vExp(), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a)); +template +void vLog(const int n, const T* a, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vLog(), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p)); +template +void vPow(const int n, const T* a, const T b, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vPow(b), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b); +template +void vAdd(const int n, const T* a, const T* b, T* r) { + hl_cpu_apply_ternary_op, 0, 0>(ternary::vAdd(), + const_cast(a), + const_cast(b), + r, + 1, + n, + n, + n, + n); +} + +template void vExp(const int n, const float* a, float* r); +template void vExp(const int n, const double* a, double* r); +template void vLog(const int n, const float* a, float* r); +template void vLog(const int n, const double* a, double* r); +template void vPow(const int n, const float* a, const float b, float* r); +template void vPow(const int n, const double* a, const double b, double* r); +template void vAdd(const int n, const float* a, const float* b, float* r); +template void vAdd(const int n, const double* a, const double* b, double* r); +#endif + +#ifdef PADDLE_USE_MKL template <> void vInvSqrt(const int n, const float* a, float* r) { vsInvSqrt(n, a, r); @@ -275,20 +323,6 @@ void vTanh(const int n, const double* a, double* r) { } #else -DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a)); -template -void vExp(const int n, const T* a, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vExp(), const_cast(a), r, 1, n, n, n); -} - -DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a)); -template -void vLog(const int n, const T* a, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vLog(), const_cast(a), r, 1, n, n, n); -} - DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a)); template void vInvSqrt(const int n, const T* a, T* r) { @@ -312,41 +346,12 @@ void vTanh(const int n, const T* a, T* r) { binary::vTanh(), const_cast(a), r, 1, n, n, n); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p)); -template -void vPow(const int n, const T* a, const T b, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vPow(b), const_cast(a), r, 1, n, n, n); -} - -DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b); -template -void vAdd(const int n, const T* a, const T* b, T* r) { - hl_cpu_apply_ternary_op, 0, 0>(ternary::vAdd(), - const_cast(a), - const_cast(b), - r, - 1, - n, - n, - n, - n); -} - -template void vExp(const int n, const float* a, float* r); -template void vExp(const int n, const double* a, double* r); -template void vLog(const int n, const float* a, float* r); -template void vLog(const int n, const double* a, double* r); template void vInvSqrt(const int n, const double* a, double* r); template void vInvSqrt(const int n, const float* a, float* r); template void vLog1p(const int n, const float* a, float* r); template void vLog1p(const int n, const double* a, double* r); template void vTanh(const int n, const float* a, float* r); template void vTanh(const int n, const double* a, double* r); -template void vPow(const int n, const float* a, const float b, float* r); -template void vPow(const int n, const 
double* a, const double b, double* r); -template void vAdd(const int n, const float* a, const float* b, float* r); -template void vAdd(const int n, const double* a, const double* b, double* r); #endif diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index 8ada0d34c6..799948cf08 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -15,6 +15,12 @@ limitations under the License. */ #ifndef MATHFUNCTIONS_H_ #define MATHFUNCTIONS_H_ +#ifdef PADDLE_USE_MKL_LITE +#include +#include +#include +#endif + #ifdef PADDLE_USE_MKL #include #include From b6c075527c9810457cb5ca1c5d04ba34a8c5e2a2 Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 18 Jul 2017 20:14:57 +0800 Subject: [PATCH 016/100] implement some basic OpKernel --- paddle/operators/add_op.cc | 5 ++--- paddle/operators/add_op.cu | 3 +-- paddle/operators/mul_op.cc | 2 +- paddle/operators/mul_op.cu | 2 +- paddle/operators/mul_op.h | 17 ++++++++++++--- paddle/operators/rowwise_add_op.cc | 2 +- paddle/operators/rowwise_add_op.cu | 2 +- paddle/operators/rowwise_add_op.h | 19 ++++++++++++++--- paddle/operators/sigmoid_op.cc | 3 ++- paddle/operators/sigmoid_op.cu | 2 +- paddle/operators/sigmoid_op.h | 12 ++++++++--- paddle/operators/softmax_op.cc | 5 ++++- paddle/operators/softmax_op.cu | 2 +- paddle/operators/softmax_op.h | 34 +++++++++++++++++++++++++++--- 14 files changed, 85 insertions(+), 25 deletions(-) diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc index 41d044cdb7..260c8064ac 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_op.cc @@ -53,6 +53,5 @@ The equation is: Out = X + Y } // namespace paddle REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker); -typedef paddle::operators::AddKernel<::paddle::platform::CPUPlace, float> - AddKernel_CPU_float; -REGISTER_OP_CPU_KERNEL(add_two, AddKernel_CPU_float); +REGISTER_OP_CPU_KERNEL( + add_two, paddle::operators::AddKernel); diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu index 0edf142ee4..2e5a755f92 100644 --- a/paddle/operators/add_op.cu +++ b/paddle/operators/add_op.cu @@ -1,6 +1,5 @@ #include "paddle/operators/add_op.h" #include "paddle/framework/op_registry.h" -typedef paddle::operators::AddKernel<::paddle::platform::GPUPlace, float> AddKernel_GPU_float; REGISTER_OP_GPU_KERNEL(add_two, - AddKernel_GPU_float); \ No newline at end of file + paddle::operators::AddKernel); \ No newline at end of file diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 713b2a5dc8..7aa63961a0 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -57,4 +57,4 @@ The equation is: Out = X * Y REGISTER_OP(mul, paddle::operators::MulOp, paddle::operators::MulOpMaker); REGISTER_OP_CPU_KERNEL( - mul, paddle::operators::MulKernel); + mul, paddle::operators::MulKernel); diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu index 201723df24..75f00e746c 100644 --- a/paddle/operators/mul_op.cu +++ b/paddle/operators/mul_op.cu @@ -17,4 +17,4 @@ REGISTER_OP_GPU_KERNEL(mul, paddle::operators::MulKernel); \ No newline at end of file + ::GPUPlace, float>); \ No newline at end of file diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index ce8a0169e0..13e5b6a950 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -20,11 +20,22 @@ namespace paddle { namespace operators { -template +template class MulKernel : public framework::OpKernel { public: - void Compute(const framework::KernelContext &context) const 
override { - LOG(INFO) << "Mul kernel in " << typeid(Place).name(); + void Compute(const framework::KernelContext& context) const override { + Eigen::array, 1> dim_pair; + dim_pair[0].first = 1; + dim_pair[0].second = 0; + + auto input0 = context.Input(0)->Get(); + auto input1 = context.Input(1)->Get(); + auto* output = context.Output(0)->GetMutable(); + + output->mutable_data(context.GetPlace()); + + output->matrix().device(*(context.GetEigenDevice())) = + input0.matrix().contract(input1.matrix(), dim_pair); } }; } // namespace operators diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc index 414bafd046..567b058fd0 100644 --- a/paddle/operators/rowwise_add_op.cc +++ b/paddle/operators/rowwise_add_op.cc @@ -58,4 +58,4 @@ REGISTER_OP(rowwise_add, paddle::operators::RowWiseAddOpMaker); REGISTER_OP_CPU_KERNEL( rowwise_add, - paddle::operators::RowWiseAddKernel); + paddle::operators::RowWiseAddKernel); diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu index 2c4bfbf93a..58fe96a4a3 100644 --- a/paddle/operators/rowwise_add_op.cu +++ b/paddle/operators/rowwise_add_op.cu @@ -3,4 +3,4 @@ REGISTER_OP_GPU_KERNEL( rowwise_add, - paddle::operators::RowWiseAddKernel); + paddle::operators::RowWiseAddKernel); diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h index 35f43e6376..f1d43002dc 100644 --- a/paddle/operators/rowwise_add_op.h +++ b/paddle/operators/rowwise_add_op.h @@ -19,11 +19,24 @@ namespace paddle { namespace operators { -template +template class RowWiseAddKernel : public framework::OpKernel { public: - void Compute(const framework::KernelContext &context) const override { - LOG(INFO) << "RowWiseAdd kernel in " << typeid(Place).name(); + void Compute(const framework::KernelContext& context) const override { + auto in0 = context.Input(0)->Get(); + auto in1 = context.Input(1)->Get(); + auto* out = context.Output(0)->GetMutable(); + + auto input = in0.matrix(); + auto bias = in1.vec(); + auto output = out->matrix(); + + const int bias_size = bias.dimension(0); + const int rest_size = input.size() / bias_size; + Eigen::DSizes one_d(input.size()); + Eigen::DSizes bcast(rest_size); + output.reshape(one_d).device(*(context.GetEigenDevice())) = + input.reshape(one_d) + bias.broadcast(bcast).reshape(one_d); } }; diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index 45ae277c53..fa13f2c4f7 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -46,4 +46,5 @@ REGISTER_OP(sigmoid, paddle::operators::SigmoidOp, paddle::operators::SigmoidOpMaker); REGISTER_OP_CPU_KERNEL( - sigmoid, paddle::operators::SigmoidKernel); + sigmoid, + paddle::operators::SigmoidKernel); diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu index 79d5222348..59bba2729f 100644 --- a/paddle/operators/sigmoid_op.cu +++ b/paddle/operators/sigmoid_op.cu @@ -2,4 +2,4 @@ #include REGISTER_OP_GPU_KERNEL( - sigmoid, paddle::operators::SigmoidKernel); + sigmoid, paddle::operators::SigmoidKernel); diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h index 42173343f3..7995b75297 100644 --- a/paddle/operators/sigmoid_op.h +++ b/paddle/operators/sigmoid_op.h @@ -20,11 +20,17 @@ namespace paddle { namespace operators { -template +template class SigmoidKernel : public framework::OpKernel { public: - void Compute(const framework::KernelContext &context) const override { - LOG(INFO) << "Sigmoid kernel in " << typeid(Place).name(); + void 
Compute(const framework::KernelContext& context) const override { + auto input = context.Input(0)->Get(); + auto* output = context.Output(0)->GetMutable(); + + output->mutable_data(context.GetPlace()); + + output->flat().device(*(context.GetEigenDevice())) = + 1.0 / (1.0 + (-1.0 * input.flat()).exp()); } }; } // namespace operators diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 4ca7be359e..42795adbdc 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -23,6 +23,8 @@ protected: const std::vector &inputs, const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 1, "Only one input is need for softmax"); + PADDLE_ENFORCE(inputs[0]->dims().size() == 2, + "The input of softmax op must be matrix"); PADDLE_ENFORCE(outputs.size() == 1, "Only one output is need for softmax"); outputs[0]->set_dims(inputs[0]->dims()); @@ -46,4 +48,5 @@ public: namespace ops = paddle::operators; REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker); -REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel); +REGISTER_OP_CPU_KERNEL(softmax, + ops::SoftmaxKernel); diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu index 903eef1b62..730c76a04b 100644 --- a/paddle/operators/softmax_op.cu +++ b/paddle/operators/softmax_op.cu @@ -2,4 +2,4 @@ #include REGISTER_OP_GPU_KERNEL( - softmax, paddle::operators::SoftmaxKernel); + softmax, paddle::operators::SoftmaxKernel); diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index 74e9e2786b..34a6c299bb 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -20,11 +20,39 @@ namespace paddle { namespace operators { -template +template class SoftmaxKernel : public framework::OpKernel { public: - void Compute(const framework::KernelContext &context) const override { - LOG(INFO) << "Softmax kernel in " << typeid(Place).name(); + void Compute(const framework::KernelContext& context) const override { + auto input = context.Input(0)->Get(); + auto* output = context.Output(0)->GetMutable(); + + auto logits = input.matrix(); + auto softmax = output->matrix(); + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + + auto shifted_logits = (logits - logits.maximum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + + softmax.device(*(context.GetEigenDevice())) = shifted_logits.exp(); + + softmax.device(*(context.GetEigenDevice())) = + (softmax * softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); } }; } // namespace operators From 816b4c8ab08306b79d3994deebdc51fdd0186bd5 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Tue, 18 Jul 2017 20:18:49 +0800 Subject: [PATCH 017/100] "add backward Op" --- paddle/framework/CMakeLists.txt | 3 + paddle/framework/fully_connected_op.cc | 39 ++++++++++ paddle/framework/fully_connected_op.h | 52 +++++++++++++ paddle/framework/net.cc | 14 ++++ paddle/framework/net.h | 2 + paddle/framework/net_op_test.cc | 104 ++++++++++++++++--------- paddle/framework/net_test.cc | 5 +- paddle/framework/op_registry.h | 47 ++++++++++- 8 files changed, 226 insertions(+), 40 deletions(-) create mode 100644 paddle/framework/fully_connected_op.cc create mode 100644 paddle/framework/fully_connected_op.h diff 
--git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index cc5b05ff0d..429a9a19a9 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -15,6 +15,8 @@ cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) cc_library(operator SRCS operator.cc DEPS op_desc device_context) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) +# cc_library(fc_op SRCS fully_connected_op.cc DEPS operator) + cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator) py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) @@ -23,5 +25,6 @@ add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch add_dependencies(framework_py_proto framework_py_proto_init) proto_library(net_proto SRCS net_proto.proto DEPS op_proto) +# cc_library(net SRCS net.cc DEPS operator net_proto op_registry fc_op) cc_library(net SRCS net.cc DEPS operator net_proto op_registry) cc_test(net_op_test SRCS net_op_test.cc DEPS net) diff --git a/paddle/framework/fully_connected_op.cc b/paddle/framework/fully_connected_op.cc new file mode 100644 index 0000000000..28be46366f --- /dev/null +++ b/paddle/framework/fully_connected_op.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/fully_connected_op.h" +#include +namespace paddle { +namespace framework { + +void FCOp::Run(const ScopePtr& scope, + const platform::DeviceContext& dev_ctx) const override { + std::cout << "FC" << std::endl; +} + +void FCOp::InferShape(const ScopePtr& scope) const override {} + +void FCGradientOp::Run(const ScopePtr& scope, + const platform::DeviceContext& dev_ctx) const override { + std::cout << "FCGrad" << std::endl; +} + +void FCGradientOp::InferShape(const ScopePtr& scope) const override {} + +REGISTER_OP(my_fc, paddle::framework::FCOp, + paddle::framework::FCOpProtoAndCheckerMaker); +REGISTER_OP(my_fc_grad, paddle::framework::FCGradientOp, + paddle::framework::FCGradientOpProtoAndCheckerMaker); +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/fully_connected_op.h b/paddle/framework/fully_connected_op.h new file mode 100644 index 0000000000..948116f653 --- /dev/null +++ b/paddle/framework/fully_connected_op.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace framework { +class FCOp : public OperatorBase { + public: + void Run(const ScopePtr& scope, + const platform::DeviceContext& dev_ctx) const override { + std::cout << "FC" << std::endl; + }; + void InferShape(const ScopePtr& scope) const override{}; +}; + +class FCOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + FCOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("x", "input data"); + AddInput("w", "weights"); + AddInput("b", "bias"); + AddOutput("y", "output data"); + AddComment("Fully connnect op"); + } +}; + +class FCGradientOp : public OperatorBase { + void Run(const ScopePtr& scope, + const platform::DeviceContext& dev_ctx) const override { + std::cout << "FCGrad" << std::endl; + }; + void InferShape(const ScopePtr& scope) const override{}; +}; + +// class FCGradientOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index 7311cda9a9..1432915927 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -15,10 +15,24 @@ */ #include "paddle/framework/net.h" +#include "paddle/framework/op_registry.h" namespace paddle { namespace framework { +std::shared_ptr AddBackwardOp(std::shared_ptr ForwardOps) { + // NetPtr->reset(new PlainNet); + // NetPtr grad_ops = new PlainNet; + std::shared_ptr grad_ops; + grad_ops.reset(new PlainNet); + for (auto& op : ForwardOps->ops_) { + auto op_grad = OpRegistry::CreateGradOp(op); + grad_ops->AddOp(op_grad); + } + grad_ops->CompleteAddOp(); + return grad_ops; +} + void PlainNet::CompleteAddOp() { std::unordered_set input_set; std::unordered_set output_set; diff --git a/paddle/framework/net.h b/paddle/framework/net.h index 19a1620e29..354319001f 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -99,5 +99,7 @@ class PlainNet : public Net { } }; +std::shared_ptr AddBackwardOp(std::shared_ptr ForwardOps); + } // namespace framework } // namespace paddle diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc index f5e1c22400..d61233a8b4 100644 --- a/paddle/framework/net_op_test.cc +++ b/paddle/framework/net_op_test.cc @@ -3,18 +3,17 @@ #include #include -namespace pd = paddle::framework; +namespace paddle { +namespace framework { static int infer_shape_cnt = 0; static int run_cnt = 0; -class TestOp : public pd::OperatorBase { +class TestOp : public OperatorBase { public: - void InferShape(const paddle::framework::ScopePtr& scope) const override { - ++infer_shape_cnt; - } - void Run(const paddle::framework::ScopePtr& scope, - const paddle::platform::DeviceContext& dev_ctx) const override { + void InferShape(const ScopePtr& scope) const override { ++infer_shape_cnt; } + void Run(const ScopePtr& scope, + const platform::DeviceContext& dev_ctx) const override { ++run_cnt; } }; @@ -32,36 +31,65 @@ void AssertSameVectorWithoutOrder(const std::vector& expected, } } +class PlainNetTest : public testing::Test { + virtual void SetUp() { + net_ = std::make_shared(); + ASSERT_NE(net_, nullptr); + + auto op1 = std::make_shared(); + op1->inputs_ = {"x", "w1", "b1"}; + op1->outputs_ = {"y"}; + net_->AddOp(op1); + + auto op2 = std::make_shared(); + op2->inputs_ = {"y", "w2", "b2"}; + op2->outputs_ = {"z"}; + net_->AddOp(op2); + net_->CompleteAddOp(); + } + + virtual void TearDown() {} + + void TestOpKernel() { + 
AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, net_->inputs_); + AssertSameVectorWithoutOrder({"y", "z"}, net_->outputs_); + auto tmp_idx_iter = net_->attrs_.find("temporary_index"); + ASSERT_NE(net_->attrs_.end(), tmp_idx_iter); + auto& tmp_idx = boost::get>(tmp_idx_iter->second); + ASSERT_EQ(1UL, tmp_idx.size()); + ASSERT_EQ("y", net_->outputs_[tmp_idx[0]]); + + auto scope = std::make_shared(); + platform::CPUDeviceContext dev_ctx; + + net_->InferShape(scope); + net_->Run(scope, dev_ctx); + ASSERT_EQ(2, infer_shape_cnt); + ASSERT_EQ(2, run_cnt); + + ASSERT_THROW(net_->AddOp(op2), EnforceNotMet); + } + + void TestAddBackwardOp() { + auto grad_ops = AddBackwardOp(net_); + for (auto& op : grad_ops->ops_) { + op->DebugString(); + } + } + + private: + std::shared_ptr net_; +}; + TEST(OpKernel, all) { - auto net = std::make_shared(); - ASSERT_NE(net, nullptr); - - auto op1 = std::make_shared(); - op1->inputs_ = {"x", "w1", "b1"}; - op1->outputs_ = {"y"}; - net->AddOp(op1); - - auto op2 = std::make_shared(); - op2->inputs_ = {"y", "w2", "b2"}; - op2->outputs_ = {"z"}; - net->AddOp(op2); - - net->CompleteAddOp(); - AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, net->inputs_); - AssertSameVectorWithoutOrder({"y", "z"}, net->outputs_); - auto tmp_idx_iter = net->attrs_.find("temporary_index"); - ASSERT_NE(net->attrs_.end(), tmp_idx_iter); - auto& tmp_idx = boost::get>(tmp_idx_iter->second); - ASSERT_EQ(1UL, tmp_idx.size()); - ASSERT_EQ("y", net->outputs_[tmp_idx[0]]); - - auto scope = std::make_shared(); - paddle::platform::CPUDeviceContext dev_ctx; - - net->InferShape(scope); - net->Run(scope, dev_ctx); - ASSERT_EQ(2, infer_shape_cnt); - ASSERT_EQ(2, run_cnt); - - ASSERT_THROW(net->AddOp(op2), paddle::framework::EnforceNotMet); + PlainNetTest net; + net->TestOpKernel(); +} + +TEST(AddBackwardOp, TestAddBackwardOp) { + PlainNetTest net; + net->TestAddBackwardOp(); } + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/net_test.cc b/paddle/framework/net_test.cc index a8e31c1497..5afc0d9204 100644 --- a/paddle/framework/net_test.cc +++ b/paddle/framework/net_test.cc @@ -13,12 +13,15 @@ limitations under the License. 
*/ #include "paddle/framework/net.h" +#include "paddle/framework/fully_connected_op.h" #include "paddle/framework/op_registry.h" #include namespace paddle { namespace framework { -class FakeFC : public Operator {} + +TEST(AddBackwardOp, ALL) + } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 24f56b2812..9183a8b1df 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -8,6 +8,7 @@ #include "paddle/framework/op_desc.pb.h" #include "paddle/framework/op_proto.pb.h" #include "paddle/framework/operator.h" +#include "paddle/framework/scope.h" namespace paddle { namespace framework { @@ -188,8 +189,8 @@ class OpRegistry { template static void RegisterOp(const std::string& op_type) { creators()[op_type] = [] { return new OpType; }; - OpProto& op_proto = protos()[op_type]; OpAttrChecker& op_checker = op_checkers()[op_type]; + OpProto& op_proto = protos()[op_type]; ProtoMakerType(&op_proto, &op_checker); *op_proto.mutable_type() = op_type; PADDLE_ENFORCE( @@ -198,6 +199,11 @@ class OpRegistry { op_type, op_proto.InitializationErrorString()); } + template + static void RegisterGradOp(const std::string& op_type) { + grad_creators()[op_type] = [] { return new OpType; }; + } + static OperatorPtr CreateOp(const OpDesc& op_desc) { std::string op_type = op_desc.type(); OperatorPtr op(creators().at(op_type)()); @@ -216,6 +222,21 @@ class OpRegistry { return op; } + static OperatorPtr CreateGradOp(std::shared_ptr op) { + OperatorPtr op_grad(grad_creators().at(op->type_)()); + op_grad->type_ = op->type_; + op_grad->inputs_.reserve(op->inputs_.size()); + for (auto& input : op->inputs_) { + op_grad->inputs_.emplace_back(input); + op_grad->outputs_.emplace_back(input + "@grad"); + } + for (auto& output : op->outputs_) { + op_grad->inputs_.emplace_back(output); + op_grad->inputs_.emplace_back(output + "@grad"); + } + return op_grad; + } + static std::unordered_map& protos() { static std::unordered_map protos_; return protos_; @@ -231,6 +252,11 @@ class OpRegistry { static std::unordered_map op_checkers_; return op_checkers_; }; + + static std::unordered_map& grad_creators() { + static std::unordered_map grad_creators_; + return grad_creators_; + } }; template @@ -241,6 +267,14 @@ class OpRegisterHelper { } }; +template +class GradOpRegisterHelper { + public: + GradOpRegisterHelper(const char* op_type) { + OpRegistry::RegisterGradOp(op_type); + } +}; + /** * check if MACRO is used in GLOBAL NAMESPACE. */ @@ -260,6 +294,17 @@ class OpRegisterHelper { __op_register_##__op_type##__(#__op_type); \ int __op_register_##__op_type##_handle__() { return 0; } +/** + * Macro to Register Operator. + */ +#define REGISTER_GRADIENT_OP(__op_type, __op_class) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op__##__op_type, \ + "REGISTER_GRADIENT_OP must be in global namespace"); \ + static ::paddle::framework::GradOpRegisterHelper<__op_class> \ + __op_register_##__op_type##__(#__op_type); \ + int __op_register_##__op_type##_handle__() { return 0; } + /** * Macro to Register OperatorKernel. 
*/ From 8b80cf898917066daf12900473a832268a2e965e Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Tue, 18 Jul 2017 21:03:52 +0800 Subject: [PATCH 018/100] "add net op testing" --- paddle/framework/net_op_test.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc index d61233a8b4..814f397c7d 100644 --- a/paddle/framework/net_op_test.cc +++ b/paddle/framework/net_op_test.cc @@ -32,6 +32,7 @@ void AssertSameVectorWithoutOrder(const std::vector& expected, } class PlainNetTest : public testing::Test { + public: virtual void SetUp() { net_ = std::make_shared(); ASSERT_NE(net_, nullptr); @@ -50,6 +51,8 @@ class PlainNetTest : public testing::Test { virtual void TearDown() {} + virtual void TestBody() {} + void TestOpKernel() { AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, net_->inputs_); AssertSameVectorWithoutOrder({"y", "z"}, net_->outputs_); @@ -67,6 +70,7 @@ class PlainNetTest : public testing::Test { ASSERT_EQ(2, infer_shape_cnt); ASSERT_EQ(2, run_cnt); + auto op2 = std::make_shared(); ASSERT_THROW(net_->AddOp(op2), EnforceNotMet); } @@ -83,12 +87,12 @@ class PlainNetTest : public testing::Test { TEST(OpKernel, all) { PlainNetTest net; - net->TestOpKernel(); + net.TestOpKernel(); } TEST(AddBackwardOp, TestAddBackwardOp) { PlainNetTest net; - net->TestAddBackwardOp(); + net.TestAddBackwardOp(); } } // namespace framework From 02e04b44411a851a86217815e7d740c634d8324f Mon Sep 17 00:00:00 2001 From: xzl Date: Tue, 18 Jul 2017 22:04:53 +0800 Subject: [PATCH 019/100] fuse the conv and depthwise conv together --- paddle/function/ConvOpTest.cpp | 281 ++++++++++++--------------------- 1 file changed, 104 insertions(+), 177 deletions(-) diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp index 61f0c18bed..27609fbbd4 100644 --- a/paddle/function/ConvOpTest.cpp +++ b/paddle/function/ConvOpTest.cpp @@ -25,11 +25,17 @@ enum TestType { kBackwardFilterTest = 2, }; +enum LayerType { + convolutionType = 0, + depthwiseConvolutionType = 1, +}; + template class ConvolutionTest { public: ConvolutionTest(const std::string& conv1, const std::string& conv2, + LayerType layerType, TestType type, std::string algo = "auto") { for (size_t batchSize : {1, 32}) { @@ -37,7 +43,17 @@ public: for (size_t filterSize : {1, 3, 5}) { for (size_t inputChannels : {3, 64}) { for (size_t outputChannels : {3, 64, 128}) { - if (inputChannels < outputChannels) break; + if (inputChannels > outputChannels) break; + if (layerType == depthwiseConvolutionType && + outputChannels % inputChannels != 0) + break; + + size_t groups = 1; + + if (layerType == depthwiseConvolutionType) { + groups = inputChannels; + } + for (size_t stride : {1, 2}) { for (size_t padding : {0, 1}) { if (padding >= filterSize) break; @@ -62,13 +78,24 @@ public: FuncConfig() .set("paddings", paddings) .set("strides", strides) - .set("groups", (size_t)1) + .set("groups", groups) .set("algo", algo)); TensorShape input{ batchSize, inputChannels, inputSize, inputSize}; - TensorShape filter{ - outputChannels, inputChannels, filterSize, filterSize}; + + TensorShape filter; + if (layerType == depthwiseConvolutionType) + filter = TensorShape({groups, + outputChannels / groups, + (size_t)1, + filterSize, + filterSize}); + else + filter = TensorShape({outputChannels, + inputChannels, + filterSize, + filterSize}); TensorShape output{ batchSize, outputChannels, outputSize, outputSize}; @@ -105,6 +132,7 @@ class ConvolutionTest2 { public: 
ConvolutionTest2(const std::string& conv1, const std::string& conv2, + LayerType layerType, TestType type, std::string algo = "auto") { for (size_t batchSize : {16}) { @@ -113,7 +141,16 @@ public: for (size_t filterHeight : {1, 5}) { for (size_t filterWidth : {3, 7}) { for (size_t inputChannels : {7}) { - for (size_t outputChannels : {32}) { + for (size_t outputChannels : {7, 32}) { + if (layerType == depthwiseConvolutionType && + outputChannels % inputChannels != 0) + break; + + size_t groups = 1; + + if (layerType == depthwiseConvolutionType) { + groups = inputChannels; + } size_t stride = 1; size_t padding = 0; size_t outputHeight = @@ -141,13 +178,24 @@ public: FuncConfig() .set("paddings", paddings) .set("strides", strides) - .set("groups", (size_t)1) + .set("groups", groups) .set("algo", algo)); TensorShape input{ batchSize, inputChannels, inputHeight, inputWidth}; - TensorShape filter{ - outputChannels, inputChannels, filterHeight, filterWidth}; + + TensorShape filter; + if (layerType == depthwiseConvolutionType) + filter = TensorShape({groups, + outputChannels / groups, + (size_t)1, + filterHeight, + filterWidth}); + else + filter = TensorShape({outputChannels, + inputChannels, + filterHeight, + filterWidth}); TensorShape output{ batchSize, outputChannels, outputHeight, outputWidth}; @@ -177,183 +225,46 @@ public: } }; -template -class DepthwiseConvolutionTest { -public: - DepthwiseConvolutionTest(const std::string& conv1, - const std::string& conv2, - TestType type, - std::string algo = "auto") { - for (size_t batchSize : {1, 32}) { - for (size_t inputSize : {7, 14, 54}) { - for (size_t filterSize : {1, 3, 5}) { - for (size_t inputChannels : {64, 128}) { - size_t outputChannels = inputChannels; - for (size_t stride : {1, 2}) { - for (size_t padding : {0, 1}) { - if (padding >= filterSize) break; - size_t outputSize = - (inputSize - filterSize + 2 * padding + stride) / stride; - VLOG(3) << " batchSize=" << batchSize - << " inputChannels=" << inputChannels - << " inputHeight=" << inputSize - << " inputWidth=" << inputSize - << " outputChannels=" << outputChannels - << " filterHeight=" << filterSize - << " filterWidth=" << filterSize - << " outputHeight=" << outputSize - << " outputWidth=" << outputSize << " stride=" << stride - << " padding=" << padding; - - std::vector paddings = {padding, padding}; - std::vector strides = {stride, stride}; - size_t groups = inputChannels; - Compare2Function test( - conv1, - conv2, - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", groups) - .set("algo", algo)); - - TensorShape input{ - batchSize, inputChannels, inputSize, inputSize}; - TensorShape filter{inputChannels, 1, 1, filterSize, filterSize}; - TensorShape output{ - batchSize, outputChannels, outputSize, outputSize}; - - if (type == kForwardTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.run(); - } else if (type == kBackwardInputTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); - test.run(); - } else if (type == kBackwardFilterTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.run(); - } - } - } - } - } - } - } - } -}; - -// Mainly used to test cases where the height 
and width (input, filter) -// are not equal. -template -class DepthwiseConvolutionTest2 { -public: - DepthwiseConvolutionTest2(const std::string& conv1, - const std::string& conv2, - TestType type, - std::string algo = "auto") { - for (size_t batchSize : {16}) { - for (size_t inputHeight : {7, 31}) { - for (size_t inputWidth : {10, 54}) { - for (size_t filterHeight : {1, 5}) { - for (size_t filterWidth : {3, 7}) { - for (size_t inputChannels : {32}) { - size_t outputChannels = inputChannels; - size_t stride = 1; - size_t padding = 0; - size_t outputHeight = - (inputHeight - filterHeight + 2 * padding + stride) / - stride; - size_t outputWidth = - (inputWidth - filterWidth + 2 * padding + stride) / stride; - VLOG(3) << " batchSize=" << batchSize - << " inputChannels=" << inputChannels - << " inputHeight=" << inputHeight - << " inputWidth=" << inputWidth - << " outputChannels=" << outputChannels - << " filterHeight=" << filterHeight - << " filterWidth=" << filterWidth - << " outputHeight=" << outputHeight - << " outputWidth=" << outputWidth - << " stride=" << stride << " padding=" << padding; - - std::vector paddings = {padding, padding}; - std::vector strides = {stride, stride}; - size_t groups = inputChannels; - Compare2Function test( - conv1, - conv2, - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", groups) - .set("algo", algo)); - - TensorShape input{ - batchSize, inputChannels, inputHeight, inputWidth}; - TensorShape filter{ - inputChannels, 1, 1, filterHeight, filterWidth}; - TensorShape output{ - batchSize, outputChannels, outputHeight, outputWidth}; - - if (type == kForwardTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.run(); - } else if (type == kBackwardInputTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); - test.run(); - } else if (type == kBackwardFilterTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.run(); - } - } - } - } - } - } - } - } -}; - // ======Start Convolution TEST====== TEST(Forward, GEMM) { ConvolutionTest test( - "NaiveConv-CPU", "GemmConv-CPU", kForwardTest); + "NaiveConv-CPU", "GemmConv-CPU", convolutionType, kForwardTest); ConvolutionTest2 test2( - "NaiveConv-CPU", "GemmConv-CPU", kForwardTest); + "NaiveConv-CPU", "GemmConv-CPU", convolutionType, kForwardTest); } #ifndef PADDLE_ONLY_CPU TEST(Forward, GEMM2) { ConvolutionTest test( - "GemmConv-CPU", "GemmConv-GPU", kForwardTest); + "GemmConv-CPU", "GemmConv-GPU", convolutionType, kForwardTest); ConvolutionTest2 test2( - "GemmConv-CPU", "GemmConv-GPU", kForwardTest); + "GemmConv-CPU", "GemmConv-GPU", convolutionType, kForwardTest); } TEST(BackwardInput, GEMM) { ConvolutionTest test( - "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest); + "GemmConvGradInput-CPU", + "GemmConvGradInput-GPU", + convolutionType, + kBackwardInputTest); ConvolutionTest2 test2( - "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest); + "GemmConvGradInput-CPU", + "GemmConvGradInput-GPU", + convolutionType, + kBackwardInputTest); } TEST(BackwardFilter, GEMM) { ConvolutionTest test( - "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest); + 
"GemmConvGradFilter-CPU", + "GemmConvGradFilter-GPU", + convolutionType, + kBackwardFilterTest); ConvolutionTest2 test2( - "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest); + "GemmConvGradFilter-CPU", + "GemmConvGradFilter-GPU", + convolutionType, + kBackwardFilterTest); } #endif // ======End Convolution TEST====== @@ -364,38 +275,54 @@ TEST(BackwardFilter, GEMM) { #ifndef PADDLE_ONLY_CPU TEST(DepthwiseConvForward, GEMM) { - DepthwiseConvolutionTest test( - "GemmConv-GPU", "DepthwiseConv-GPU", kForwardTest); - DepthwiseConvolutionTest2 test2( - "GemmConv-GPU", "DepthwiseConv-GPU", kForwardTest); + ConvolutionTest test( + "GemmConv-GPU", + "DepthwiseConv-GPU", + depthwiseConvolutionType, + kForwardTest); + ConvolutionTest2 test2( + "GemmConv-GPU", + "DepthwiseConv-GPU", + depthwiseConvolutionType, + kForwardTest); } TEST(DepthwiseConvForward, GEMM2) { - DepthwiseConvolutionTest test( - "DepthwiseConv-GPU", "DepthwiseConv-GPU", kForwardTest); - DepthwiseConvolutionTest2 test2( - "DepthwiseConv-GPU", "DepthwiseConv-GPU", kForwardTest); + ConvolutionTest test( + "DepthwiseConv-GPU", + "DepthwiseConv-GPU", + depthwiseConvolutionType, + kForwardTest); + ConvolutionTest2 test2( + "DepthwiseConv-GPU", + "DepthwiseConv-GPU", + depthwiseConvolutionType, + kForwardTest); } TEST(DepthwiseConvBackwardInput, GEMM) { - DepthwiseConvolutionTest test( + ConvolutionTest test( "DepthwiseConvGradInput-GPU", "DepthwiseConvGradInput-GPU", + depthwiseConvolutionType, kBackwardInputTest); - DepthwiseConvolutionTest2 test2( + ConvolutionTest2 test2( "DepthwiseConvGradInput-GPU", "DepthwiseConvGradInput-GPU", + depthwiseConvolutionType, kBackwardInputTest); } TEST(DepthwiseConvBackwardFilter, GEMM) { - DepthwiseConvolutionTest test( + ConvolutionTest test( "DepthwiseConvGradFilter-GPU", "DepthwiseConvGradFilter-GPU", + depthwiseConvolutionType, kBackwardFilterTest); - DepthwiseConvolutionTest2 test2( + ConvolutionTest2 test2( "DepthwiseConvGradFilter-GPU", "DepthwiseConvGradFilter-GPU", + depthwiseConvolutionType, kBackwardFilterTest); } #endif From 11588b36700cc1dd444b524c4cff0d785fe7f769 Mon Sep 17 00:00:00 2001 From: xzl Date: Tue, 18 Jul 2017 22:07:26 +0800 Subject: [PATCH 020/100] support inputchannels != outputchannels of depthwiseconv --- paddle/function/DepthwiseConvOp.cpp | 13 ++- paddle/function/DepthwiseConvOp.h | 10 +- paddle/function/DepthwiseConvOpGpu.cu | 117 +++++++++++++----------- paddle/gserver/tests/test_LayerGrad.cpp | 2 +- 4 files changed, 85 insertions(+), 57 deletions(-) diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp index 0ac83f5824..d1430239bc 100644 --- a/paddle/function/DepthwiseConvOp.cpp +++ b/paddle/function/DepthwiseConvOp.cpp @@ -30,6 +30,7 @@ public: int inputChannels, int inputHeight, int inputWidth, + int filterMultiplier, int filterHeight, int filterWidth, int strideH, @@ -53,6 +54,7 @@ public: int inputChannels, int inputHeight, int inputWidth, + int filterMultiplier, int filterHeight, int filterWidth, int strideH, @@ -75,6 +77,7 @@ public: int inputChannels, int inputHeight, int inputWidth, + int filterMultiplier, int filterHeight, int filterWidth, int strideH, @@ -122,6 +125,7 @@ public: size_t outputChannels = output[1]; size_t outputHeight = output[2]; size_t outputWidth = output[3]; + size_t filterMultiplier = outputChannels / groups_; real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); @@ -137,6 +141,7 @@ public: inputChannels, inputHeight, inputWidth, + filterMultiplier, filterHeight, 
filterWidth, strideH(), @@ -183,6 +188,7 @@ public: size_t outputChannels = output[1]; size_t outputHeight = output[2]; size_t outputWidth = output[3]; + size_t filterMultiplier = outputChannels / groups_; real* outputGrad = inputs[0].data(); real* filterData = inputs[1].data(); @@ -198,6 +204,7 @@ public: inputChannels, inputHeight, inputWidth, + filterMultiplier, filterHeight, filterWidth, strideH(), @@ -243,13 +250,14 @@ public: size_t outputChannels = output[1]; size_t outputHeight = output[2]; size_t outputWidth = output[3]; + size_t filterMultiplier = outputChannels / groups_; real* outputGrad = inputs[0].data(); real* inputData = inputs[1].data(); real* filterGrad = outputs[0].data(); - int size = - inputChannels * filterHeight * filterWidth * outputHeight * outputWidth; + int size = outputChannels * filterHeight * filterWidth * outputHeight * + outputWidth; resizeBuffer(size); real* colData = reinterpret_cast(memory_->getBuf()); @@ -264,6 +272,7 @@ public: inputChannels, inputHeight, inputWidth, + filterMultiplier, filterHeight, filterWidth, strideH(), diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/function/DepthwiseConvOp.h index 2b9bef4cd7..1bf70e52f3 100644 --- a/paddle/function/DepthwiseConvOp.h +++ b/paddle/function/DepthwiseConvOp.h @@ -32,6 +32,7 @@ namespace paddle { * \param[in] inputChannels channels of inputData. * \param[in] inputHeight height of inputData. * \param[in] inputWidth width of inputData.. + * \param[in] filterMultiplier equals to outputChannels/groups_. * \param[in] filterHeight height of filter. * \param[in] filterWidth widht of filter. * \param[in] strideH stride size in height direction. @@ -53,6 +54,7 @@ public: int inputChannels, int inputHeight, int inputWidth, + int filterMultiplier, int filterHeight, int filterWidth, int strideH, @@ -74,7 +76,8 @@ public: * \param[in] outputWidth width of outputData. * \param[in] inputChannels channels of input data. * \param[in] inputHeight height of inputData. - * \param[in] inputWidth width of inputData.. + * \param[in] inputWidth width of inputData. + * \param[in] filterMultiplier equals to outputChannels/groups_. * \param[in] filterHeight height of filter. * \param[in] filterWidth widht of filter. * \param[in] strideH stride size in height direction. @@ -96,6 +99,7 @@ public: int inputChannels, int inputHeight, int inputWidth, + int filterMultiplier, int filterHeight, int filterWidth, int strideH, @@ -116,7 +120,8 @@ public: * \param[in] outputWidth width of outputData. * \param[in] inputChannels channels of input data. * \param[in] inputHeight height of inputData. - * \param[in] inputWidth width of inputData.. + * \param[in] inputWidth width of inputData. + * \param[in] filterMultiplier equals to outputChannels/groups_. * \param[in] filterHeight height of filter. * \param[in] filterWidth widht of filter. * \param[in] strideH stride size in height direction. 
@@ -140,6 +145,7 @@ public: int inputChannels, int inputHeight, int inputWidth, + int filterMultiplier, int filterHeight, int filterWidth, int strideH, diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu index 7740b7022d..51aed9ffcf 100644 --- a/paddle/function/DepthwiseConvOpGpu.cu +++ b/paddle/function/DepthwiseConvOpGpu.cu @@ -25,7 +25,7 @@ void ConvolutionDepthwiseForward(const int nthreads, const T* const inputData, const T* const filterData, const int batchSize, const int outputChannels, const int outputHeight, const int outputWidth,const int inputChannels, const int inputHeight, const int inputWidth, - const int filterHeight, const int filterWidth, const int strideH, + const int filterMultiplier, const int filterHeight, const int filterWidth, const int strideH, const int strideW, const int paddingH, const int paddingW, T* const outputData) { @@ -33,23 +33,25 @@ void ConvolutionDepthwiseForward(const int nthreads, (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(index < nthreads) { - const int n = index / outputChannels / outputHeight / outputWidth; - const int c = (index / outputHeight / outputWidth) % outputChannels; - const int h = (index / outputWidth) % outputHeight; - const int w = index % outputWidth; - const T* weight = filterData + c * filterHeight * filterWidth; + const int batch = index / outputChannels / outputHeight / outputWidth; + const int c_out = (index / outputHeight / outputWidth) % outputChannels; + const int h_out = (index / outputWidth) % outputHeight; + const int w_out = index % outputWidth; + + const int c_in = c_out / filterMultiplier; + const T* weight = filterData + c_out * filterHeight * filterWidth; T value = 0; - const int h_in_start = -paddingH + h * strideH; - const int w_in_start = -paddingW + w * strideW; - const int h_in_end = -paddingH + h * strideH + filterHeight - 1; - const int w_in_end = -paddingW + w * strideW + filterWidth - 1; + const int h_in_start = -paddingH + h_out * strideH; + const int w_in_start = -paddingW + w_out * strideW; + const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1; + const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1; if ((h_in_start >= 0) && (h_in_end < inputHeight) &&(w_in_start >= 0) && (w_in_end < inputWidth)) { for (int kh = 0; kh < filterHeight; ++kh) { for (int kw = 0; kw < filterWidth; ++kw) { - const int h_in = -paddingH + h * strideH + kh; - const int w_in = -paddingW + w * strideW + kw; - const int offset = ((n * inputChannels + c) * inputHeight + h_in) + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; + const int offset = ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth + w_in; value += (*weight) * inputData[offset]; ++weight; @@ -58,11 +60,11 @@ void ConvolutionDepthwiseForward(const int nthreads, }else{ for (int kh = 0; kh < filterHeight; ++kh) { for (int kw = 0; kw < filterWidth; ++kw) { - const int h_in = -paddingH + h * strideH + kh; - const int w_in = -paddingW + w * strideW + kw; + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && (w_in < inputWidth)) { - const int offset = ((n * outputChannels + c) * inputHeight + h_in) + const int offset = ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth + w_in; value += (*weight) * inputData[offset]; } @@ -81,38 +83,42 @@ void ConvolutionDepthwiseInputBackward(const int 
nthreads, const T* const top_diff, const T* const weight_data, const int num, const int outputChannels, const int outputHeight, const int outputWidth,const int inputChannels, const int inputHeight, const int inputWidth, - const int filterHeight, const int filterWidth, const int strideH, + const int filterMultiplier, const int filterHeight, const int filterWidth, const int strideH, const int strideW, const int paddingH, const int paddingW, T* const bottom_diff) { int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(index < nthreads) { - const int n = index / inputChannels / inputHeight / inputWidth; - const int c = (index / inputHeight / inputWidth) % inputChannels; - const int h = (index / inputWidth) % inputHeight; - const int w = index % inputWidth; - const T* weight = weight_data + c * filterHeight * filterWidth; + const int batch = index / inputChannels / inputHeight / inputWidth; + const int c_in = (index / inputHeight / inputWidth) % inputChannels; + const int h_in = (index / inputWidth) % inputHeight; + const int w_in = index % inputWidth; + const int c_out_start = c_in * filterMultiplier; T value = 0; - for (int kh = 0; kh < filterHeight; ++kh) { - for (int kw = 0; kw < filterWidth; ++kw) { - const int h_out_s = h + paddingH - kh; - const int w_out_s = w + paddingW - kw; - if (((h_out_s % strideH) == 0) && ((w_out_s % strideW) == 0)) { - const int h_out = h_out_s / strideH; - const int w_out = w_out_s / strideW; - // TODO(zhaolong) : the 'if' affect the effectiveness, it needs to optimize - if ((h_out >= 0) && (h_out < outputHeight) - && (w_out >= 0) && (w_out < outputWidth)) { - const int offset = ((n * outputChannels + c) * outputHeight + h_out) - * outputWidth + w_out; - value += (*weight) * top_diff[offset]; - } + for(int c_out = c_out_start; c_out < c_out_start + filterMultiplier; c_out ++){ + //weight bixu c_out + const T* weight = weight_data + c_out * filterHeight * filterWidth; + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_out_s = h_in + paddingH - kh; + const int w_out_s = w_in + paddingW - kw; + if (((h_out_s % strideH) == 0) && ((w_out_s % strideW) == 0)) { + const int h_out = h_out_s / strideH; + const int w_out = w_out_s / strideW; + // TODO(zhaolong) : the 'if' affect the effectiveness, it needs to optimize + if ((h_out >= 0) && (h_out < outputHeight) + && (w_out >= 0) && (w_out < outputWidth)) { + const int offset = ((batch * outputChannels + c_out) * outputHeight + h_out) + * outputWidth + w_out; + value += (*weight) * top_diff[offset]; + } + } + ++weight; + } } - ++weight; - } } bottom_diff[index] += value; - } + } } // CUDA kernel to compute the depthwise convolution backprop w.r.t filter. 
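The index arithmetic above is the heart of the inputChannels != outputChannels support: with groups equal to inputChannels, every input channel owns filterMultiplier consecutive output channels, so the forward kernel maps c_out down to c_in = c_out / filterMultiplier, while the input-backward kernel accumulates over the filterMultiplier output channels produced by a given c_in. A minimal host-side sketch of that mapping (names and numbers are illustrative only, not part of the patch):

#include <cassert>

// Which input channel feeds a given output channel, assuming
// groups == inputChannels and outputChannels == inputChannels * filterMultiplier.
int InputChannelOf(int c_out, int filterMultiplier) { return c_out / filterMultiplier; }

int main() {
  const int inputChannels = 32;
  const int filterMultiplier = 2;
  const int outputChannels = inputChannels * filterMultiplier;  // 64
  // Output channels 0 and 1 are both computed from input channel 0,
  // and the last output channel from the last input channel.
  assert(InputChannelOf(0, filterMultiplier) == 0);
  assert(InputChannelOf(1, filterMultiplier) == 0);
  assert(InputChannelOf(outputChannels - 1, filterMultiplier) == inputChannels - 1);
  return 0;
}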
@@ -122,26 +128,27 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, const T* const top_diff, const T* const inputData, const int num, const int outputChannels, const int outputHeight, const int outputWidth, const int inputChannels, const int inputHeight, const int inputWidth, - const int filterHeight, const int filterWidth, const int strideH, + const int filterMultiplier, const int filterHeight, const int filterWidth, const int strideH, const int strideW, const int paddingH, const int paddingW, T* const buffer_data) { int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < nthreads) { - const int h = (index / outputWidth) % outputHeight; - const int w = index % outputWidth; + const int h_out = (index / outputWidth) % outputHeight; + const int w_out = index % outputWidth; const int kh = (index / filterWidth / outputHeight / outputWidth) % filterHeight; const int kw = (index / outputHeight / outputWidth) % filterWidth; - const int h_in = -paddingH + h * strideH + kh; - const int w_in = -paddingW + w * strideW + kw; + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && (w_in < inputWidth)) { - const int c = index / filterHeight / filterWidth / outputHeight / outputWidth; - const int n = num_i; - const int top_offset = ((n * outputChannels + c) * outputHeight + h) - * outputWidth + w; - const int bottom_offset = ((n * inputChannels + c) * inputHeight + h_in) + const int c_out = index / filterHeight / filterWidth / outputHeight / outputWidth; + const int c_in = c_out / filterMultiplier; + const int batch = num_i; + const int top_offset = ((batch * outputChannels + c_out) * outputHeight + h_out) + * outputWidth + w_out; + const int bottom_offset = ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth + w_in; buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset]; } else { @@ -162,6 +169,7 @@ public: int inputChannels, int inputHeight, int inputWidth, + int filterMultiplier, int filterHeight, int filterWidth, int strideH, @@ -190,6 +198,7 @@ public: inputChannels, inputHeight, inputWidth, + filterMultiplier, filterHeight, filterWidth, strideH, @@ -212,6 +221,7 @@ public: int inputChannels, int inputHeight, int inputWidth, + int filterMultiplier, int filterHeight, int filterWidth, int strideH, @@ -242,6 +252,7 @@ public: inputChannels, inputHeight, inputWidth, + filterMultiplier, filterHeight, filterWidth, strideH, @@ -264,6 +275,7 @@ public: int inputChannels, int inputHeight, int inputWidth, + int filterMultiplier, int filterHeight, int filterWidth, int strideH, @@ -273,14 +285,14 @@ public: T* colData, T* filterGrad){ - int colDataSize = inputChannels * filterHeight * filterWidth * outputHeight * outputWidth; + int colDataSize = outputChannels * filterHeight * filterWidth * outputHeight * outputWidth; size_t blocks = (colDataSize + 1024 -1) / 1024; size_t blockX = 512; size_t blockY = (blocks+512-1)/512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); - BaseMatrix filterGradMatrix(inputChannels * filterHeight * filterWidth, 1, filterGrad, false, true); + BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, 1, filterGrad, false, true); for(int i = 0; i < batchSize; i++) { ConvolutionDepthwiseFilterBackward @@ -296,6 +308,7 @@ public: inputChannels, inputHeight, inputWidth, + filterMultiplier, filterHeight, filterWidth, strideH, @@ -304,8 +317,8 @@ public: paddingW, colData ); 
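      // At this point colData holds, for batch i, one partial product
      // top_diff * inputData per (output channel, filter position, output position).
      // The reduction below views that buffer as an M x K matrix with
      // M = outputChannels * filterHeight * filterWidth and
      // K = outputHeight * outputWidth, folding the K spatial positions of each
      // row into the corresponding filter-gradient entry and accumulating
      // across the batch loop.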
- int M = colDataSize / outputHeight / outputWidth; int K = outputHeight * outputWidth; + int M = colDataSize / K; BaseMatrix colMatrix(M, K, colData, false, true); filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 50e7a91d3f..2f28cec53e 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -355,7 +355,7 @@ void testDepthwiseConvLayer(const string& type, bool useGpu) { config.layerConfig.set_partial_sum(1); config.layerConfig.set_shared_biases(true); - config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 96}); + config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192 / 2}); LayerInputConfig* input = config.layerConfig.add_inputs(); ConvConfig* conv = input->mutable_conv_conf(); conv->set_filter_size(2); From d43fbbae748678fe87098536faaa880cff3206c2 Mon Sep 17 00:00:00 2001 From: xzl Date: Tue, 18 Jul 2017 22:08:20 +0800 Subject: [PATCH 021/100] add comments for python api --- .../paddle/trainer_config_helpers/layers.py | 68 ++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 351bd8fea8..f9457971cd 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2269,7 +2269,6 @@ def img_depthwise_conv_layer(input, name=None, num_channels=None, act=None, - groups=None, stride=1, padding=0, bias_attr=None, @@ -2281,11 +2280,78 @@ def img_depthwise_conv_layer(input, padding_y=None, trans=False, layer_type=None): + """ + DepthwiseConvolution layer for image. + + The details of depthwise convolution layer, please refer + https://arxiv.org/abs/1704.04861 + + The Depthwise Convolution layer must meet this requirement that the groups equals to the + inputChannels. And the groups must be divisible by outputChannels. + So the filter shape will be (groups, outputChannels/groups, 1, filter_size, filter_size_y) + + The example usage is: + + .. code-block:: python + + conv = img_depthwise_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=False, + act=ReluActivation()) + + :param name: Layer name. + :type name: basestring + :param input: Layer Input. + :type input: LayerOutput + :param filter_size: The x dimension of a filter kernel. Or input a tuple for + two image dimension. + :type filter_size: int|tuple|list + :param filter_size_y: The y dimension of a filter kernel. Since PaddlePaddle + currently supports rectangular filters, the filter's + shape will be (filter_size, filter_size_y). + :type filter_size_y: int|None + :param num_filters: Each filter group's number of filter + :param act: Activation type. Default is tanh + :type act: BaseActivation + :param stride: The x dimension of the stride. Or input a tuple for two image + dimension. + :type stride: int|tuple|list + :param stride_y: The y dimension of the stride. + :type stride_y: int + :param padding: The x dimension of the padding. Or input a tuple for two + image dimension + :type padding: int|tuple|list + :param padding_y: The y dimension of the padding. + :type padding_y: int + :param bias_attr: DepthwiseConvolution bias attribute. None means default bias. + False means no bias. + :type bias_attr: ParameterAttribute|False + :param num_channels: number of input channels. If None will be set + automatically from previous output. 
+ :type num_channels: int + :param param_attr: DepthwiseConvolution param attribute. None means default attribute + :type param_attr: ParameterAttribute + :param shared_biases: Is biases will be shared between filters or not. + :type shared_biases: bool + :param layer_attr: Layer Extra Attribute. + :type layer_attr: ExtraLayerAttribute + :param trans: true if it is a convTransLayer, false if it is a convLayer + :type trans: bool + :param layer_type: specify the layer_type, default is None. If trans=True, + layer_type has to be "exconvt" or "cudnn_convt", + otherwise layer_type has to be either "exconv" or + "cudnn_conv" + :type layer_type: String + :return: LayerOutput object. + :rtype: LayerOutput + """ if num_channels is None: assert input.num_filters is not None num_channels = input.num_filters + # the groups in depthwise conv should be equal to input channels. groups = num_channels if filter_size_y is None: From 7f1533f2518bfbfdb5b87f1769b5df6574ba7242 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 18 Jul 2017 22:29:30 +0800 Subject: [PATCH 022/100] test collaborating --- paddle/framework/op_registry.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index d26c0b50d8..98ef426b10 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -234,7 +234,7 @@ class OpRegistry { const AttributeMap& attrs) { auto op_create_it = creators().find(type); PADDLE_ENFORCE(op_create_it != creators().end(), - "Operator %s cannot be found", type); + "Operator %s cannot be found.", type); auto op = op_create_it->second(); op->type_ = type; From 855cae603c5ae25408f394d4f68498a680603535 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Tue, 18 Jul 2017 22:31:59 +0800 Subject: [PATCH 023/100] move unused file --- paddle/framework/fully_connected_op.cc | 39 ---------- paddle/framework/fully_connected_op.h | 2 + paddle/framework/net_op_test.cc | 102 +++++++++++-------------- paddle/framework/net_test.cc | 27 ------- 4 files changed, 45 insertions(+), 125 deletions(-) delete mode 100644 paddle/framework/fully_connected_op.cc delete mode 100644 paddle/framework/net_test.cc diff --git a/paddle/framework/fully_connected_op.cc b/paddle/framework/fully_connected_op.cc deleted file mode 100644 index 28be46366f..0000000000 --- a/paddle/framework/fully_connected_op.cc +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include "paddle/framework/fully_connected_op.h" -#include -namespace paddle { -namespace framework { - -void FCOp::Run(const ScopePtr& scope, - const platform::DeviceContext& dev_ctx) const override { - std::cout << "FC" << std::endl; -} - -void FCOp::InferShape(const ScopePtr& scope) const override {} - -void FCGradientOp::Run(const ScopePtr& scope, - const platform::DeviceContext& dev_ctx) const override { - std::cout << "FCGrad" << std::endl; -} - -void FCGradientOp::InferShape(const ScopePtr& scope) const override {} - -REGISTER_OP(my_fc, paddle::framework::FCOp, - paddle::framework::FCOpProtoAndCheckerMaker); -REGISTER_OP(my_fc_grad, paddle::framework::FCGradientOp, - paddle::framework::FCGradientOpProtoAndCheckerMaker); -} // namespace framework -} // namespace paddle diff --git a/paddle/framework/fully_connected_op.h b/paddle/framework/fully_connected_op.h index 948116f653..f049eda9bb 100644 --- a/paddle/framework/fully_connected_op.h +++ b/paddle/framework/fully_connected_op.h @@ -47,6 +47,8 @@ class FCGradientOp : public OperatorBase { }; // class FCGradientOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {}; +REGISTER_OP(my_fc, FCOp, FCOpProtoAndCheckerMaker); +REGISTER_GRADIENT_OP(my_fc_grad, FCGradientOp); } // namespace framework } // namespace paddle diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc index 814f397c7d..18151c56d9 100644 --- a/paddle/framework/net_op_test.cc +++ b/paddle/framework/net_op_test.cc @@ -2,6 +2,7 @@ #include #include #include +#include "paddle/framework/fully_connected_op.h" namespace paddle { namespace framework { @@ -31,68 +32,51 @@ void AssertSameVectorWithoutOrder(const std::vector& expected, } } -class PlainNetTest : public testing::Test { - public: - virtual void SetUp() { - net_ = std::make_shared(); - ASSERT_NE(net_, nullptr); - - auto op1 = std::make_shared(); - op1->inputs_ = {"x", "w1", "b1"}; - op1->outputs_ = {"y"}; - net_->AddOp(op1); - - auto op2 = std::make_shared(); - op2->inputs_ = {"y", "w2", "b2"}; - op2->outputs_ = {"z"}; - net_->AddOp(op2); - net_->CompleteAddOp(); - } - - virtual void TearDown() {} - - virtual void TestBody() {} - - void TestOpKernel() { - AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, net_->inputs_); - AssertSameVectorWithoutOrder({"y", "z"}, net_->outputs_); - auto tmp_idx_iter = net_->attrs_.find("temporary_index"); - ASSERT_NE(net_->attrs_.end(), tmp_idx_iter); - auto& tmp_idx = boost::get>(tmp_idx_iter->second); - ASSERT_EQ(1UL, tmp_idx.size()); - ASSERT_EQ("y", net_->outputs_[tmp_idx[0]]); - - auto scope = std::make_shared(); - platform::CPUDeviceContext dev_ctx; - - net_->InferShape(scope); - net_->Run(scope, dev_ctx); - ASSERT_EQ(2, infer_shape_cnt); - ASSERT_EQ(2, run_cnt); - - auto op2 = std::make_shared(); - ASSERT_THROW(net_->AddOp(op2), EnforceNotMet); - } - - void TestAddBackwardOp() { - auto grad_ops = AddBackwardOp(net_); - for (auto& op : grad_ops->ops_) { - op->DebugString(); - } - } - - private: - std::shared_ptr net_; -}; - TEST(OpKernel, all) { - PlainNetTest net; - net.TestOpKernel(); + auto net = std::make_shared(); + ASSERT_NE(net, nullptr); + + auto op1 = std::make_shared(); + op1->inputs_ = {"x", "w1", "b1"}; + op1->outputs_ = {"y"}; + net->AddOp(op1); + + auto op2 = std::make_shared(); + op2->inputs_ = {"y", "w2", "b2"}; + op2->outputs_ = {"z"}; + net->AddOp(op2); + + net->CompleteAddOp(); + AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, net->inputs_); + AssertSameVectorWithoutOrder({"y", "z"}, net->outputs_); + auto 
tmp_idx_iter = net->attrs_.find("temporary_index"); + ASSERT_NE(net->attrs_.end(), tmp_idx_iter); + auto& tmp_idx = boost::get>(tmp_idx_iter->second); + ASSERT_EQ(1UL, tmp_idx.size()); + ASSERT_EQ("y", net->outputs_[tmp_idx[0]]); + + auto scope = std::make_shared(); + platform::CPUDeviceContext dev_ctx; + + net->InferShape(scope); + net->Run(scope, dev_ctx); + ASSERT_EQ(2, infer_shape_cnt); + ASSERT_EQ(2, run_cnt); + + ASSERT_THROW(net->AddOp(op2), EnforceNotMet); } -TEST(AddBackwardOp, TestAddBackwardOp) { - PlainNetTest net; - net.TestAddBackwardOp(); +TEST(AddBackwardOp, TestGradOp) { + auto net = std::make_shared(); + ASSERT_NE(net, nullptr); + auto op1 = std::make_shared(); + op1->inputs_ = {"x", "w1", "b1"}; + op1->outputs_ = {"y"}; + net->AddOp(op1); + auto grad_ops = AddBackwardOp(net); + for (auto& op : grad_ops->ops_) { + op->DebugString(); + } } } // namespace framework diff --git a/paddle/framework/net_test.cc b/paddle/framework/net_test.cc deleted file mode 100644 index 5afc0d9204..0000000000 --- a/paddle/framework/net_test.cc +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/framework/net.h" -#include "paddle/framework/fully_connected_op.h" -#include "paddle/framework/op_registry.h" - -#include - -namespace paddle { -namespace framework { - -TEST(AddBackwardOp, ALL) - -} // namespace framework -} // namespace paddle From dbb658805ef0b00d0ba91103b0884aa4ee483b86 Mon Sep 17 00:00:00 2001 From: xzl Date: Tue, 18 Jul 2017 22:57:03 +0800 Subject: [PATCH 024/100] modity the format --- paddle/function/DepthwiseConvOp.cpp | 9 +- paddle/function/DepthwiseConvOpGpu.cu | 116 +++++++++++++------------- 2 files changed, 61 insertions(+), 64 deletions(-) diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp index d1430239bc..9180c19b11 100644 --- a/paddle/function/DepthwiseConvOp.cpp +++ b/paddle/function/DepthwiseConvOp.cpp @@ -99,8 +99,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& input = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& output = outputs[0].shape(); @@ -162,8 +161,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& output = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& input = outputs[0].shape(); @@ -225,8 +223,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& output = inputs[0].shape(); const TensorShape& input = inputs[1].shape(); 
const TensorShape& filter = outputs[0].shape(); diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu index 51aed9ffcf..bb7b97df5a 100644 --- a/paddle/function/DepthwiseConvOpGpu.cu +++ b/paddle/function/DepthwiseConvOpGpu.cu @@ -20,58 +20,58 @@ namespace paddle { // CUDA kernel to compute the depthwise convolution forward pass template -__global__ +__global__ void ConvolutionDepthwiseForward(const int nthreads, const T* const inputData, const T* const filterData, const int batchSize, const int outputChannels, const int outputHeight, - const int outputWidth,const int inputChannels, const int inputHeight, const int inputWidth, - const int filterMultiplier, const int filterHeight, const int filterWidth, const int strideH, - const int strideW, const int paddingH, const int paddingW, - T* const outputData) { + const int outputWidth, const int inputChannels, const int inputHeight, + const int inputWidth, const int filterMultiplier, const int filterHeight, + const int filterWidth, const int strideH, const int strideW, + const int paddingH, const int paddingW, T* const outputData) { int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - - if(index < nthreads) { + + if (index < nthreads) { const int batch = index / outputChannels / outputHeight / outputWidth; const int c_out = (index / outputHeight / outputWidth) % outputChannels; const int h_out = (index / outputWidth) % outputHeight; const int w_out = index % outputWidth; - const int c_in = c_out / filterMultiplier; + const int c_in = c_out / filterMultiplier; const T* weight = filterData + c_out * filterHeight * filterWidth; T value = 0; const int h_in_start = -paddingH + h_out * strideH; const int w_in_start = -paddingW + w_out * strideW; const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1; const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1; - if ((h_in_start >= 0) && (h_in_end < inputHeight) - &&(w_in_start >= 0) && (w_in_end < inputWidth)) { + if ((h_in_start >= 0) && (h_in_end < inputHeight) + && (w_in_start >= 0) && (w_in_end < inputWidth)) { for (int kh = 0; kh < filterHeight; ++kh) { for (int kw = 0; kw < filterWidth; ++kw) { const int h_in = -paddingH + h_out * strideH + kh; const int w_in = -paddingW + w_out * strideW + kw; - const int offset = ((batch * inputChannels + c_in) * inputHeight + h_in) - * inputWidth + w_in; + const int offset = ((batch * inputChannels + c_in) + * inputHeight + h_in) * inputWidth + w_in; value += (*weight) * inputData[offset]; ++weight; - } - } - }else{ + } + } + } else { for (int kh = 0; kh < filterHeight; ++kh) { for (int kw = 0; kw < filterWidth; ++kw) { const int h_in = -paddingH + h_out * strideH + kh; const int w_in = -paddingW + w_out * strideW + kw; if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && (w_in < inputWidth)) { - const int offset = ((batch * inputChannels + c_in) * inputHeight + h_in) - * inputWidth + w_in; + const int offset = ((batch * inputChannels + c_in) + * inputHeight + h_in) * inputWidth + w_in; value += (*weight) * inputData[offset]; } ++weight; } } - } + } outputData[index] = value; } } @@ -82,21 +82,21 @@ __global__ void ConvolutionDepthwiseInputBackward(const int nthreads, const T* const top_diff, const T* const weight_data, const int num, const int outputChannels, const int outputHeight, - const int outputWidth,const int inputChannels, const int inputHeight, const int inputWidth, - const int filterMultiplier, const int filterHeight, const int filterWidth, const int strideH, - 
const int strideW, const int paddingH, const int paddingW, - T* const bottom_diff) { + const int outputWidth, const int inputChannels, const int inputHeight, + const int inputWidth, const int filterMultiplier, const int filterHeight, + const int filterWidth, const int strideH, const int strideW, + const int paddingH, const int paddingW, T* const bottom_diff) { int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if(index < nthreads) { + if (index < nthreads) { const int batch = index / inputChannels / inputHeight / inputWidth; const int c_in = (index / inputHeight / inputWidth) % inputChannels; const int h_in = (index / inputWidth) % inputHeight; const int w_in = index % inputWidth; - const int c_out_start = c_in * filterMultiplier; + const int c_out_start = c_in * filterMultiplier; T value = 0; - for(int c_out = c_out_start; c_out < c_out_start + filterMultiplier; c_out ++){ - //weight bixu c_out + for (int c_out = c_out_start; + c_out < c_out_start + filterMultiplier; c_out ++) { const T* weight = weight_data + c_out * filterHeight * filterWidth; for (int kh = 0; kh < filterHeight; ++kh) { for (int kw = 0; kw < filterWidth; ++kw) { @@ -105,11 +105,12 @@ void ConvolutionDepthwiseInputBackward(const int nthreads, if (((h_out_s % strideH) == 0) && ((w_out_s % strideW) == 0)) { const int h_out = h_out_s / strideH; const int w_out = w_out_s / strideW; - // TODO(zhaolong) : the 'if' affect the effectiveness, it needs to optimize + // TODO(zhaolong) : the 'if' affect the effectiveness, + // it needs to optimize if ((h_out >= 0) && (h_out < outputHeight) && (w_out >= 0) && (w_out < outputWidth)) { - const int offset = ((batch * outputChannels + c_out) * outputHeight + h_out) - * outputWidth + w_out; + const int offset = ((batch * outputChannels + c_out) + * outputHeight + h_out) * outputWidth + w_out; value += (*weight) * top_diff[offset]; } } @@ -127,10 +128,10 @@ __global__ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, const T* const top_diff, const T* const inputData, const int num, const int outputChannels, const int outputHeight, - const int outputWidth, const int inputChannels, const int inputHeight, const int inputWidth, - const int filterMultiplier, const int filterHeight, const int filterWidth, const int strideH, - const int strideW, const int paddingH, const int paddingW, - T* const buffer_data) { + const int outputWidth, const int inputChannels, const int inputHeight, + const int inputWidth, const int filterMultiplier, const int filterHeight, + const int filterWidth, const int strideH, const int strideW, + const int paddingH, const int paddingW, T* const buffer_data) { int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < nthreads) { @@ -143,13 +144,14 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, const int w_in = -paddingW + w_out * strideW + kw; if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && (w_in < inputWidth)) { - const int c_out = index / filterHeight / filterWidth / outputHeight / outputWidth; - const int c_in = c_out / filterMultiplier; + const int c_out = index / + (filterHeight * filterWidth * outputHeight * outputWidth); + const int c_in = c_out / filterMultiplier; const int batch = num_i; - const int top_offset = ((batch * outputChannels + c_out) * outputHeight + h_out) - * outputWidth + w_out; - const int bottom_offset = ((batch * inputChannels + c_in) * inputHeight + h_in) - * inputWidth + w_in; + const int top_offset = ((batch * 
outputChannels + c_out) * + outputHeight + h_out) * outputWidth + w_out; + const int bottom_offset = ((batch * inputChannels + c_in) + * inputHeight + h_in) * inputWidth + w_in; buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset]; } else { buffer_data[index] = 0; @@ -160,13 +162,13 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads, template class DepthwiseConvFunctor{ public: - void operator()(const T* inputData, + void operator()(const T* inputData, const T* filterData, int batchSize, int outputChannels, int outputHeight, int outputWidth, - int inputChannels, + int inputChannels, int inputHeight, int inputWidth, int filterMultiplier, @@ -177,7 +179,6 @@ public: int paddingH, int paddingW, T* outputData){ - int outputSize = batchSize * outputChannels * outputHeight * outputWidth; size_t blocks = (outputSize + 1024 -1) / 1024; @@ -188,14 +189,14 @@ public: ConvolutionDepthwiseForward <<< grid, threads, 0, STREAM_DEFAULT >>>( - outputSize, - inputData, + outputSize, + inputData, filterData, batchSize, outputChannels, outputHeight, outputWidth, - inputChannels, + inputChannels, inputHeight, inputWidth, filterMultiplier, @@ -229,7 +230,6 @@ public: int paddingH, int paddingW, T* inputGrad){ - int inputSize = batchSize * inputChannels * inputHeight * inputWidth; size_t blocks = (inputSize + 1024 -1) / 1024; @@ -249,7 +249,7 @@ public: outputChannels, outputHeight, outputWidth, - inputChannels, + inputChannels, inputHeight, inputWidth, filterMultiplier, @@ -284,17 +284,18 @@ public: int paddingW, T* colData, T* filterGrad){ - - int colDataSize = outputChannels * filterHeight * filterWidth * outputHeight * outputWidth; + int colDataSize = outputChannels * filterHeight * filterWidth + * outputHeight * outputWidth; size_t blocks = (colDataSize + 1024 -1) / 1024; size_t blockX = 512; size_t blockY = (blocks+512-1)/512; dim3 threads(1024, 1); dim3 grid(blockX, blockY); - BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, 1, filterGrad, false, true); + BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, + 1, filterGrad, false, true); - for(int i = 0; i < batchSize; i++) { + for (int i = 0; i < batchSize; i++) { ConvolutionDepthwiseFilterBackward <<< grid, threads, 0, STREAM_DEFAULT >>>( i, @@ -305,24 +306,23 @@ public: outputChannels, outputHeight, outputWidth, - inputChannels, + inputChannels, inputHeight, inputWidth, - filterMultiplier, + filterMultiplier, filterHeight, filterWidth, strideH, strideW, paddingH, paddingW, - colData - ); + colData); int K = outputHeight * outputWidth; int M = colDataSize / K; BaseMatrix colMatrix(M, K, colData, false, true); - filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); - } + filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); + } } }; @@ -330,7 +330,7 @@ public: template class DepthwiseConvGradInputFunctor; template class DepthwiseConvFunctor; template class DepthwiseConvGradFilterFunctor; -#else +#else template class DepthwiseConvGradInputFunctor; template class DepthwiseConvFunctor; template class DepthwiseConvGradFilterFunctor; From 66520af9ca9bcd1663e48ad48e9628e01535af96 Mon Sep 17 00:00:00 2001 From: xzl Date: Wed, 19 Jul 2017 11:05:56 +0800 Subject: [PATCH 025/100] accelerate inputbackward(delete 'if' in this func) of depthwise conv --- paddle/function/DepthwiseConvOpGpu.cu | 42 +++++++++++++++------------ 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu index 
bb7b97df5a..28e6aa4a01 100644 --- a/paddle/function/DepthwiseConvOpGpu.cu +++ b/paddle/function/DepthwiseConvOpGpu.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "DepthwiseConvOp.h" #include "GemmFunctor.h" #include "paddle/math/BaseMatrix.h" @@ -93,29 +94,32 @@ void ConvolutionDepthwiseInputBackward(const int nthreads, const int c_in = (index / inputHeight / inputWidth) % inputChannels; const int h_in = (index / inputWidth) % inputHeight; const int w_in = index % inputWidth; + const int c_out_start = c_in * filterMultiplier; + + int h_out_start = (h_in - filterHeight + paddingH + strideH)/strideH; + h_out_start = 0 > h_out_start ? 0 : h_out_start; + int h_out_end = (h_in + paddingH)/strideH; + h_out_end = outputHeight - 1 < h_out_end? outputHeight - 1 : h_out_end; + int w_out_start = (w_in - filterWidth + paddingW + strideW)/strideW; + w_out_start = 0 > w_out_start ? 0 : w_out_start; + int w_out_end = (w_in + paddingW)/strideW; + w_out_end = outputWidth - 1 < w_out_end? outputWidth - 1 : w_out_end; + T value = 0; + for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier; c_out ++) { - const T* weight = weight_data + c_out * filterHeight * filterWidth; - for (int kh = 0; kh < filterHeight; ++kh) { - for (int kw = 0; kw < filterWidth; ++kw) { - const int h_out_s = h_in + paddingH - kh; - const int w_out_s = w_in + paddingW - kw; - if (((h_out_s % strideH) == 0) && ((w_out_s % strideW) == 0)) { - const int h_out = h_out_s / strideH; - const int w_out = w_out_s / strideW; - // TODO(zhaolong) : the 'if' affect the effectiveness, - // it needs to optimize - if ((h_out >= 0) && (h_out < outputHeight) - && (w_out >= 0) && (w_out < outputWidth)) { - const int offset = ((batch * outputChannels + c_out) - * outputHeight + h_out) * outputWidth + w_out; - value += (*weight) * top_diff[offset]; - } - } - ++weight; - } + for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { + const int filter_h = h_in + paddingH - h_out * strideH; + for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { + const int filter_w = w_in + paddingW - w_out * strideW; + const int filter_offset = c_out * filterHeight * filterWidth + + filter_h * filterWidth + filter_w; + const int top_diff_offset = ((batch * outputChannels + c_out) * + outputHeight + h_out)* outputWidth + w_out; + value += top_diff[top_diff_offset] * weight_data[filter_offset]; + } } } bottom_diff[index] += value; From 028f3dc4e5fcb558041ff168e233a89b41aeaed9 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 19 Jul 2017 13:13:19 +0800 Subject: [PATCH 026/100] Add memcpy --- paddle/memory/memory.cc | 14 ++++++++++++++ paddle/memory/memory.h | 3 +++ 2 files changed, 17 insertions(+) diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 430ce98bfc..5be9bef3ac 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -47,6 +47,20 @@ size_t Used(platform::CPUPlace place) { return GetCPUBuddyAllocator()->Used(); } +template <> +void Copy(platform::CPUPlace, void* dst, + platform::CPUPlace, void* src, + size_t size) { + memcpy(dst, src, size); +} + +template <> +void Copy(platform::CPUPlace, void* dst, + platform::CPUPlace, void* src, + size_t size) { + memcpy(dst, src, size); +} + #ifndef PADDLE_ONLY_CPU detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index 2d6f4fd2a0..96c00cb106 
100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -28,5 +28,8 @@ void Free(Place, void*); template size_t Used(Place); +template +void Copy(Place1, void* dst, Place2, void* src, size_t size); + } // namespace memory } // namespace paddle From bf4da3d97a6d116a8b4304c63a3debc0314bb1ba Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 19 Jul 2017 13:25:52 +0800 Subject: [PATCH 027/100] Refactor Rigistry::CreateGradOp() We put forward Op's inputs, outputs and output gradients into Grad Op's inputs, and put forward Op's input gradients into Grad Op's output. So Grad Op's `in_out_idx`, `input_format` and 'output format' need to be rebuilt during Op creating. --- paddle/framework/op_registry.h | 150 ++++++++++++++++++++++++++------- paddle/framework/operator.h | 5 ++ 2 files changed, 126 insertions(+), 29 deletions(-) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 98ef426b10..6ba0784f1b 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -228,6 +228,11 @@ class OpRegistry { } } + template + static void RegisterGradOp(const std::string& op_type) { + grad_creators()[op_type] = [] { return new OpType; }; + } + static OperatorPtr CreateOp(const std::string& type, const VarNameList& inputs, const VarNameList& outputs, @@ -240,6 +245,7 @@ class OpRegistry { op->type_ = type; op->inputs_ = inputs; op->outputs_ = outputs; + op->attrs_ = attrs; op_checkers().at(type).Check(op->attrs_); @@ -256,11 +262,6 @@ class OpRegistry { return OperatorPtr(op); } - template - static void RegisterGradOp(const std::string& op_type) { - grad_creators()[op_type] = [] { return new OpType; }; - } - static OperatorPtr CreateOp(const OpDesc& op_desc) { std::vector inputs; inputs.reserve((size_t)op_desc.inputs_size()); @@ -280,19 +281,16 @@ class OpRegistry { return CreateOp(op_desc.type(), inputs, outputs, attrs); } - static OperatorPtr CreateGradOp(std::shared_ptr op) { - OperatorPtr op_grad(grad_creators().at(op->type_)()); - op_grad->type_ = op->type_; - op_grad->inputs_.reserve(op->inputs_.size()); - for (auto& input : op->inputs_) { - op_grad->inputs_.emplace_back(input); - op_grad->outputs_.emplace_back(input + "@grad"); - } - for (auto& output : op->outputs_) { - op_grad->inputs_.emplace_back(output); - op_grad->inputs_.emplace_back(output + "@grad"); - } - return op_grad; + static OperatorPtr CreateGradOp(OperatorPtr op) { + OperatorPtr grad_op(grad_creators().at(op->type_)()); + grad_op->type_ = op->type_; + + AssembleGradInOut(op, grad_op); + GenerateGradArgOffset(op, grad_op); + GenerateGradAttr(op, grad_op); + + grad_op->Init(); + return grad_op; } static std::unordered_map& protos() { @@ -307,6 +305,21 @@ class OpRegistry { return maps_; } + static std::unordered_map& creators() { + static std::unordered_map creators_; + return creators_; + } + + static std::unordered_map& op_checkers() { + static std::unordered_map op_checkers_; + return op_checkers_; + }; + + static std::unordered_map& grad_creators() { + static std::unordered_map grad_creators_; + return grad_creators_; + } + static void GenerateTempVariableName(OperatorBase* op) { static std::atomic gUniqId(0UL); for (auto& outname : op->outputs_) { @@ -318,19 +331,98 @@ class OpRegistry { } } - static std::unordered_map& creators() { - static std::unordered_map creators_; - return creators_; + static void AssembleGradInOut(OperatorPtr op, OperatorPtr grad_op) { + size_t in_sz = op->inputs_.size() + op->outputs_.size() * 2; + grad_op->inputs_.reserve(in_sz); + size_t 
out_sz = op->inputs_.size(); + grad_op->outputs_.reserve(out_sz); + // copy op->inputs_ to grad_op->inputs_ + std::copy(op->inputs_.begin(), op->inputs_.end(), + std::back_inserter(grad_op->inputs_)); + // copy op->outputs_ to grad_op->inputs_ + std::copy(op->outputs_.begin(), op->outputs_.end(), + std::back_inserter(grad_op->inputs_)); + // add gradients of op->outputs_ to grad_op->inputs_ + for (const std::string& name : op->outputs_) { + grad_op->inputs_.emplace_back(name + OperatorBase::GRAD_VAR_SUFFIX()); + } + // add gradients of op->inputs_ to grad_op->outputs_ + for (const std::string& name : op->inputs_) { + grad_op->outputs_.emplace_back(name + OperatorBase::GRAD_VAR_SUFFIX()); + } } - static std::unordered_map& op_checkers() { - static std::unordered_map op_checkers_; - return op_checkers_; - }; + static void GenerateGradArgOffset(OperatorPtr op, OperatorPtr grad_op) { + VarIndexMap* grad_varmap = new VarIndexMap(); + const OpProto& op_proto = protos()[op->type_]; + int idx = 0; + // offset of op's inputs + for (const auto& var : op_proto.inputs()) { + (*grad_varmap)[var.name()] = idx++; + } + // offset of op's outputs + for (const auto& var : op_proto.outputs()) { + (*grad_varmap)[var.name()] = idx++; + } + // offset of gradients of op's output + for (const auto& var : op_proto.outputs()) { + (*grad_varmap)[var.name() + OperatorBase::GRAD_VAR_SUFFIX()] = idx++; + } + idx = 0; + // offset of gradients of op's input + for (const auto& var : op_proto.inputs()) { + (*grad_varmap)[var.name() + OperatorBase::GRAD_VAR_SUFFIX()] = idx++; + } + grad_op->in_out_idxs_.reset(grad_varmap); + } - static std::unordered_map& grad_creators() { - static std::unordered_map grad_creators_; - return grad_creators_; + static void GenerateGradAttr(OperatorPtr op, OperatorPtr grad_op) { + const OpProto& op_proto = protos()[op->type_]; + grad_op->attrs_ = op->attrs_; + grad_op->attrs_.erase("input_format"); + grad_op->attrs_.erase("output_format"); + bool has_in_format = op->attrs_.count("input_format"); + bool has_out_format = op->attrs_.count("output_format"); + // grad_op's inputs_ contains op's inputs_, outputs_ and gradients of + // outpus_. So grad_op's input_format is necessary when op has + // either input_format or output_format. + if (has_in_format || has_out_format) { + std::vector old_in_format; + std::vector old_out_format; + has_in_format + ? old_in_format = op->GetAttr>("input_format") + : old_in_format = std::vector(op_proto.inputs_size()), + std::iota(old_in_format.begin(), old_in_format.end(), 0); + has_out_format + ? old_out_format = op->GetAttr>("output_format") + : old_out_format = std::vector(op_proto.outputs_size()), + std::iota(old_out_format.begin(), old_out_format.end(), 0); + + std::vector in_format; + in_format.reserve(old_in_format.size() + old_out_format.size() * 2); + int base = 0; + for (const int& idx : old_in_format) { + in_format.emplace_back(idx + base); + } + base += op->inputs_.size(); + for (const int& idx : old_out_format) { + in_format.emplace_back(idx + base); + } + base += op->outputs_.size(); + for (const int& idx : old_in_format) { + in_format.emplace_back(idx + base); + } + grad_op->attrs_["input_format"] = in_format; + // grad_op's outputs_ contains gradients of op's inputs_. So grad_op's + // output_format is necessary only when op has input_format. 
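      // Illustration with hypothetical variable names: for an op with inputs
      // {X, W} and output {Out}, AssembleGradInOut above gives the grad op the
      // inputs {X, W, Out, Out@GRAD} and the outputs {X@GRAD, W@GRAD}; the
      // offsets and format vectors rebuilt in these helpers follow that same
      // concatenation order.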
+ if (has_in_format) { + std::vector out_format; + out_format.reserve(op_proto.inputs_size()); + std::copy(old_in_format.begin(), old_in_format.end(), + std::back_inserter(out_format)); + grad_op->attrs_["output_format"] = out_format; + } + } } }; @@ -370,7 +462,7 @@ class GradOpRegisterHelper { int __op_register_##__op_type##_handle__() { return 0; } /** - * Macro to Register Operator. + * Macro to Register Gradient Operator. */ #define REGISTER_GRADIENT_OP(__op_type, __op_class) \ STATIC_ASSERT_GLOBAL_NAMESPACE( \ diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 5f046d6293..31d7b2575c 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -63,6 +63,11 @@ class OperatorBase { /// but it will be convert to a unique name in scope after OpCreator. static std::string TMP_VAR_NAME() { return "@TEMP@"; } + /// If a variable's name has a certain suffix, it means that the + /// variable is the gradient of another varibale. + /// e.g. Variable "x@GRAD" is the gradient of varibale "x". + static std::string GRAD_VAR_SUFFIX() { return "@GRAD"; } + virtual ~OperatorBase() {} template From f7390d115cb0660a329d171d827d3685797ddf52 Mon Sep 17 00:00:00 2001 From: xzl Date: Wed, 19 Jul 2017 14:15:51 +0800 Subject: [PATCH 028/100] delete useless .h header in DepthwiseConvOpGpu.cu --- paddle/function/DepthwiseConvOpGpu.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu index 28e6aa4a01..ede0d27aa8 100644 --- a/paddle/function/DepthwiseConvOpGpu.cu +++ b/paddle/function/DepthwiseConvOpGpu.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include "DepthwiseConvOp.h" #include "GemmFunctor.h" #include "paddle/math/BaseMatrix.h" From cb95587feb6e32c8595d02e76e58aa69a96b5035 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Wed, 19 Jul 2017 14:28:29 +0800 Subject: [PATCH 029/100] "ignore some gradient of specific op" --- paddle/framework/op_proto.proto | 6 ++++++ paddle/framework/op_registry.h | 16 ++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/paddle/framework/op_proto.proto b/paddle/framework/op_proto.proto index 596b8588e7..366c84e53d 100644 --- a/paddle/framework/op_proto.proto +++ b/paddle/framework/op_proto.proto @@ -84,6 +84,11 @@ message VarProto { // "temporary_index": [1] // } optional bool temporary = 4 [default=false]; + + // The gradient of operator can be ignored immediately + // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2 + // can be ignored for the future optimized on graph. + optional bool ignore_gradient = 6; } // Op protocol message for 3rd-party language binding. @@ -105,4 +110,5 @@ message OpProto { // The type of that Op. 
required string type = 5; + } diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 6ba0784f1b..dded0ad33d 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -74,25 +74,29 @@ class OpProtoAndCheckerMaker { protected: void AddInput(const std::string& name, const std::string& comment, - bool multiple = false) { + bool multiple = false, bool ignore_gradient = false) { auto input = proto_->mutable_inputs()->Add(); *input->mutable_name() = name; *input->mutable_comment() = comment; + *input->set_ignore_gradient(ignore_gradient); input->set_multiple(multiple); if (multiple) { SetHasMultipleInput(); } } - void AddInputs(const std::string& name, const std::string& comment) { - AddInput(name, comment, true); + void AddInputs(const std::string& name, const std::string& comment, + bool ignore_gradient = false) { + AddInput(name, comment, true, ignore_gradient); } void AddOutput(const std::string& name, const std::string& comment, - bool temporary = false, bool multiple = false) { + bool temporary = false, bool multiple = false, + bool ignore_gradient = false) { auto output = proto_->mutable_outputs()->Add(); *output->mutable_name() = name; *output->mutable_comment() = comment; + *output->set_ignore_gradient(ignore_gradient); output->set_multiple(multiple); if (multiple) { SetHasMultipleOutput(); @@ -104,8 +108,8 @@ class OpProtoAndCheckerMaker { } void AddOutputs(const std::string& name, const std::string& comment, - bool temporary = false) { - AddOutput(name, comment, temporary, true); + bool temporary = false, bool ignore_gradient = false) { + AddOutput(name, comment, temporary, true, ignore_gradient); } template From 94a6b1fa43f9e47b52bb6bb4841ffbc4f9bdfbfc Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 19 Jul 2017 14:29:02 +0800 Subject: [PATCH 030/100] rename a macro --- paddle/framework/op_registry.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 6ba0784f1b..53d7ea7d9d 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -466,7 +466,7 @@ class GradOpRegisterHelper { */ #define REGISTER_GRADIENT_OP(__op_type, __op_class) \ STATIC_ASSERT_GLOBAL_NAMESPACE( \ - __reg_op__##__op_type, \ + __reg_gradient_op_##__reg_op__##__op_type, \ "REGISTER_GRADIENT_OP must be in global namespace"); \ static ::paddle::framework::GradOpRegisterHelper<__op_class> \ __op_register_##__op_type##__(#__op_type); \ From 8bc48921061ef5fee489f39601d985b525dc3784 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Wed, 19 Jul 2017 14:40:31 +0800 Subject: [PATCH 031/100] "fix comment " --- paddle/framework/op_registry.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index dded0ad33d..92354f4ffd 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -78,7 +78,7 @@ class OpProtoAndCheckerMaker { auto input = proto_->mutable_inputs()->Add(); *input->mutable_name() = name; *input->mutable_comment() = comment; - *input->set_ignore_gradient(ignore_gradient); + input->set_ignore_gradient(ignore_gradient); input->set_multiple(multiple); if (multiple) { SetHasMultipleInput(); @@ -96,7 +96,7 @@ class OpProtoAndCheckerMaker { auto output = proto_->mutable_outputs()->Add(); *output->mutable_name() = name; *output->mutable_comment() = comment; - *output->set_ignore_gradient(ignore_gradient); + output->set_ignore_gradient(ignore_gradient); 
output->set_multiple(multiple); if (multiple) { SetHasMultipleOutput(); From 14cfb8c262c1f16c8916087c8dc4ce2d16500c7e Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 19 Jul 2017 08:22:21 +0000 Subject: [PATCH 032/100] fix gpu build error --- cmake/flags.cmake | 1 + paddle/operators/mul_op.h | 7 ++++--- paddle/operators/rowwise_add_op.h | 1 + paddle/operators/softmax_op.h | 21 ++++++++++++--------- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index c31e62fc08..34fd348893 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -124,6 +124,7 @@ set(GPU_COMMON_FLAGS -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=unused-function # Warnings in Numpy Header. + -Wno-error=array-bounds # Warnings in Eigen::array ) if (APPLE) diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 13e5b6a950..81d5953cf0 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -24,9 +24,10 @@ template class MulKernel : public framework::OpKernel { public: void Compute(const framework::KernelContext& context) const override { - Eigen::array, 1> dim_pair; - dim_pair[0].first = 1; - dim_pair[0].second = 0; + Eigen::array, 1> dim_pair = { + Eigen::IndexPair(1, 0)}; + // dim_pair[0].first = 1; + // dim_pair[0].second = 0; auto input0 = context.Input(0)->Get(); auto input1 = context.Input(1)->Get(); diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h index f1d43002dc..dd5cde0c5d 100644 --- a/paddle/operators/rowwise_add_op.h +++ b/paddle/operators/rowwise_add_op.h @@ -26,6 +26,7 @@ public: auto in0 = context.Input(0)->Get(); auto in1 = context.Input(1)->Get(); auto* out = context.Output(0)->GetMutable(); + out->mutable_data(context.GetPlace()); auto input = in0.matrix(); auto bias = in1.vec(); diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index 34a6c299bb..6d675ea5f6 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -26,6 +26,7 @@ public: void Compute(const framework::KernelContext& context) const override { auto input = context.Input(0)->Get(); auto* output = context.Output(0)->GetMutable(); + output->mutable_data(context.GetPlace()); auto logits = input.matrix(); auto softmax = output->matrix(); @@ -40,19 +41,21 @@ public: Eigen::DSizes batch_by_one(batch_size, 1); Eigen::DSizes one_by_class(1, num_classes); - auto shifted_logits = (logits - logits.maximum(along_class) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); + auto shifted_logits = (logits - + logits.maximum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); softmax.device(*(context.GetEigenDevice())) = shifted_logits.exp(); softmax.device(*(context.GetEigenDevice())) = - (softmax * softmax.sum(along_class) - .inverse() - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); + (softmax * + softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); } }; } // namespace operators From 55d301722fac0454e7769e4b16d77aa9ab907042 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 19 Jul 2017 16:41:11 +0800 Subject: [PATCH 033/100] Simplify Tensor implimentation ATTENTION: some interfaces changed: 1. void Tensor::set_dims(const DDim& dims) ==> void Tensor::Resize(const DDim& dims). 2. void Tensor::ShareDataFrom(const Tensor& src) ==> void Tensor::ShareDataWith(const Tensor& src) 3. 
DDim Tensor::dims() const ==> const DDim& Tensor::dims() const --- paddle/framework/tensor.h | 65 +++++++++++------------------- paddle/framework/tensor_test.cc | 10 ++--- paddle/memory/memory.h | 10 +++++ paddle/operators/add_op.cc | 2 +- paddle/operators/mul_op.cc | 2 +- paddle/operators/rowwise_add_op.cc | 2 +- paddle/operators/sigmoid_op.cc | 2 +- paddle/operators/softmax_op.cc | 2 +- paddle/pybind/pybind.cc | 2 +- paddle/pybind/tensor_bind.h | 2 +- 10 files changed, 45 insertions(+), 54 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 1dd421cdb6..a0f0bb1ffd 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -40,21 +40,21 @@ class Tensor { template const T* data() const { - CheckDims(); + EnforceSufficientMemory(); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); } template T* raw_data() const { - CheckDims(); + EnforceSufficientMemory(); return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } template T* mutable_data(DDim dims, platform::Place place) { - set_dims(dims); + Resize(dims); return mutable_data(place); } @@ -147,11 +147,9 @@ class Tensor { } template - void ShareDataFrom(const Tensor& src) { - src.CheckDims(); - holder_ = src.holder_; - set_dims(src.dims()); - offset_ = src.offset_; + void ShareDataWith(const Tensor& src) { + src.EnforceSufficientMemory(); + *this = src; } template @@ -159,9 +157,9 @@ class Tensor { PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) && platform::is_cpu_place(dst_place), "Tensor::CopyFrom only support CPU now."); - src.CheckDims(); + src.EnforceSufficientMemory(); size_t size = product(src.dims_) * sizeof(T); - set_dims(src.dims()); + Resize(src.dims()); const void* src_ptr = static_cast(src.data()); void* dst_ptr = static_cast(mutable_data(dst_place)); memcpy(dst_ptr, src_ptr, size); @@ -169,34 +167,25 @@ class Tensor { template Tensor Slice(const int& begin_idx, const int& end_idx) const { - CheckDims(); - PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0], - "Slice index is less than zero or out of bound."); + EnforceSufficientMemory(); + PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero."); + PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound."); PADDLE_ENFORCE(begin_idx < end_idx, "Begin index must be less than end index."); PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1."); - std::vector d = vectorize(dims_); - int base = 1; - for (size_t i = 1; i < d.size(); ++i) { - base *= d[i]; - } + int base = product(dims_) / dims_[0]; Tensor dst; dst.holder_ = holder_; DDim dst_dims = dims_; dst_dims[0] = end_idx - begin_idx; - dst.set_dims(dst_dims); + dst.Resize(dst_dims); dst.offset_ = offset_ + begin_idx * base * sizeof(T); return dst; } - void set_dims(const DDim& dims) { - if (dims == dims_) { - return; - } - dims_ = dims; - } + void Resize(const DDim& dims) { dims_ = dims; } - DDim dims() const { return dims_; } + const DDim& dims() const { return dims_; } private: // Placeholder hides type T, so it doesn't appear as a template @@ -211,21 +200,9 @@ class Tensor { template struct PlaceholderImpl : public Placeholder { - private: - template - class Deleter { - public: - Deleter(PType place) : place_(place) {} - void operator()(T* ptr) { memory::Free(place_, static_cast(ptr)); } - - private: - PType place_; - }; - - public: PlaceholderImpl(PlaceType place, size_t size) : ptr_(static_cast(memory::Alloc(place, size)), - Deleter(place)), + memory::PodDeleter(place)), 
place_(place), size_(size) {} @@ -234,13 +211,13 @@ class Tensor { virtual paddle::platform::Place place() const { return place_; } virtual std::type_index type() const { return std::type_index(typeid(T)); } - std::unique_ptr> ptr_; + std::unique_ptr> ptr_; platform::Place place_; // record the place of ptr_. size_t size_; // size of the memory block. }; template - inline void CheckDims() const { + inline void EnforceSufficientMemory() const { PADDLE_ENFORCE(holder_ != nullptr, "Tenosr holds no memory. Call Tensor::mutable_data first."); PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_, @@ -250,7 +227,11 @@ class Tensor { std::shared_ptr holder_; // holds the memory block if allocated. DDim dims_; - size_t offset_; // marks the begin of tensor data area. + // A PlaceHolder may be shared by more than one tensor. Some of them may be + // slices of the others. So the offset_ is introduced here to indicate the + // byte offset between PlaceHolder::ptr_ and where tensor's data really + // begins. + size_t offset_; template friend struct paddle::pybind::details::CastToPyBufferImpl; }; diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 84c6f0cf65..a78bdd41b4 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -19,7 +19,7 @@ TEST(Tensor, Dims) { using namespace paddle::framework; using namespace paddle::platform; Tensor tt; - tt.set_dims(make_ddim({2, 3, 4})); + tt.Resize(make_ddim({2, 3, 4})); DDim dims = tt.dims(); ASSERT_EQ(arity(dims), 3); for (int i = 0; i < 3; ++i) { @@ -97,7 +97,7 @@ TEST(Tensor, MutableData) { #endif } -TEST(Tensor, ShareDataFrom) { +TEST(Tensor, ShareDataWith) { using namespace paddle::framework; using namespace paddle::platform; { @@ -106,7 +106,7 @@ TEST(Tensor, ShareDataFrom) { // Try to share data form uninitialized tensor bool caught = false; try { - dst_tensor.ShareDataFrom(src_tensor); + dst_tensor.ShareDataWith(src_tensor); } catch (EnforceNotMet err) { caught = true; std::string msg = @@ -119,7 +119,7 @@ TEST(Tensor, ShareDataFrom) { ASSERT_TRUE(caught); src_tensor.mutable_data(make_ddim({2, 3, 4}), CPUPlace()); - dst_tensor.ShareDataFrom(src_tensor); + dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } @@ -128,7 +128,7 @@ TEST(Tensor, ShareDataFrom) { Tensor src_tensor; Tensor dst_tensor; src_tensor.mutable_data(make_ddim({2, 3, 4}), GPUPlace()); - dst_tensor.ShareDataFrom(src_tensor); + dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } #endif diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index 2d6f4fd2a0..f5890fb844 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -28,5 +28,15 @@ void Free(Place, void*); template size_t Used(Place); +template +class PodDeleter { + public: + PodDeleter(PlaceType place) : place_(place) {} + void operator()(T* ptr) { Free(place_, static_cast(ptr)); } + + private: + PlaceType place_; +}; + } // namespace memory } // namespace paddle diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc index 41d044cdb7..858a650899 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_op.cc @@ -31,7 +31,7 @@ protected: "Inputs/Outputs of AddOp must all be set"); PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(), "Two input of Add Op's dimension must be same."); - outputs[0]->set_dims(inputs[0]->dims()); + outputs[0]->Resize(inputs[0]->dims()); } }; diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc 
index 713b2a5dc8..e7bda6a7df 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -33,7 +33,7 @@ protected: dim0[1] == dim1[0], "First matrix's width must be equal with second matrix's height."); PADDLE_ENFORCE(outputs.size() == 1, "The mul op must take one output"); - outputs[0]->set_dims({dim0[0], dim1[1]}); + outputs[0]->Resize({dim0[0], dim1[1]}); } }; diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc index 414bafd046..97d42c1934 100644 --- a/paddle/operators/rowwise_add_op.cc +++ b/paddle/operators/rowwise_add_op.cc @@ -30,7 +30,7 @@ protected: PADDLE_ENFORCE(dim1.size() == 1, "The second input must be vector"); PADDLE_ENFORCE(dim0[1] == dim1[0], "The width of two input must be same"); PADDLE_ENFORCE(outputs.size() == 1, "The output size must be 1"); - outputs[0]->set_dims(inputs[0]->dims()); + outputs[0]->Resize(inputs[0]->dims()); } }; diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index 45ae277c53..e87fb78d3f 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -24,7 +24,7 @@ protected: const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 1, "Sigmoid Op only have one input"); PADDLE_ENFORCE(outputs.size() == 1, "Sigmoid Op only have one output"); - outputs[0]->set_dims(inputs[0]->dims()); + outputs[0]->Resize(inputs[0]->dims()); } }; diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 4ca7be359e..07302d67d9 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -25,7 +25,7 @@ protected: PADDLE_ENFORCE(inputs.size() == 1, "Only one input is need for softmax"); PADDLE_ENFORCE(outputs.size() == 1, "Only one output is need for softmax"); - outputs[0]->set_dims(inputs[0]->dims()); + outputs[0]->Resize(inputs[0]->dims()); } }; diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index fc9c6544c3..56d6fe4dd2 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -42,7 +42,7 @@ PYBIND11_PLUGIN(core) { [](const pd::Tensor& self) { return pd::vectorize(self.dims()); }) .def("set_dims", [](pd::Tensor& self, const std::vector& dim) { - self.set_dims(pd::make_ddim(dim)); + self.Resize(pd::make_ddim(dim)); }) .def("alloc_float", [](pd::Tensor& self) { diff --git a/paddle/pybind/tensor_bind.h b/paddle/pybind/tensor_bind.h index b96516643a..995e102bf9 100644 --- a/paddle/pybind/tensor_bind.h +++ b/paddle/pybind/tensor_bind.h @@ -86,7 +86,7 @@ void PyTensorSetFromArray( dims.push_back((int)array.shape()[i]); } - self.set_dims(framework::make_ddim(dims)); + self.Resize(framework::make_ddim(dims)); auto *dst = self.mutable_data(paddle::platform::CPUPlace()); std::memcpy(dst, array.data(), sizeof(T) * array.size()); } From 77ff97ab386257ecc064ca7aec87f7e27cc8886d Mon Sep 17 00:00:00 2001 From: xzl Date: Wed, 19 Jul 2017 20:07:53 +0800 Subject: [PATCH 034/100] fuse interface of depthwise to expand in python api --- python/paddle/trainer/config_parser.py | 50 ------ .../paddle/trainer_config_helpers/layers.py | 159 ------------------ 2 files changed, 209 deletions(-) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index ec83694845..7190f0e8c9 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1799,56 +1799,6 @@ class ParameterReluLayer(LayerBase): self.create_input_parameter(0, input_layer.size / partial_sum) -@config_layer('depthwise_conv') -class DepthwiseConvLayer(LayerBase): - 
layer_type = 'depthwise_conv' - - def __init__(self, - name, - inputs=[], - bias=True, - num_filters=None, - shared_biases=False, - **xargs): - super(DepthwiseConvLayer, self).__init__( - name, self.layer_type, 0, inputs=inputs, **xargs) - - if num_filters is not None: - self.config.num_filters = num_filters - - use_gpu = int(g_command_config_args.get("use_gpu", 0)) - parallel_nn = int(g_command_config_args.get("parallel_nn", 0)) - - self.layer_type = "depthwise_conv" - # need to specify layer in config - self.config.type = self.layer_type - - if shared_biases is not None: - self.config.shared_biases = shared_biases - - for input_index in xrange(len(self.inputs)): - input_layer = self.get_input_layer(input_index) - conv_conf = self.config.inputs[input_index].conv_conf - #set the groups, the groups equals the input channels - self.inputs[input_index].conv.groups = self.inputs[ - input_index].conv.channels - parse_conv(self.inputs[input_index].conv, input_layer.name, - conv_conf, num_filters) - psize = self.calc_parameter_size(conv_conf) - self.create_input_parameter(input_index, psize) - self.set_cnn_layer(name, conv_conf.output_y, conv_conf.output_x, - self.config.num_filters) - - psize = self.config.size - if shared_biases: - psize = self.config.num_filters - self.create_bias_parameter(bias, psize, [psize, 1]) - - def calc_parameter_size(self, conv_conf): - return self.config.num_filters * conv_conf.filter_channels \ - * (conv_conf.filter_size * conv_conf.filter_size_y) - - @config_layer('conv') class ConvLayerBase(LayerBase): layer_type = 'conv' diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index b455da3d4b..78aa0778f8 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -57,7 +57,6 @@ __all__ = [ 'classification_cost', 'LayerOutput', 'img_conv_layer', - 'img_depthwise_conv_layer', 'img_pool_layer', 'batch_norm_layer', 'img_cmrnorm_layer', @@ -152,7 +151,6 @@ class LayerType(object): HSIGMOID = 'hsigmoid' CONV_LAYER = 'conv' CONVTRANS_LAYER = 'convt' - DEPTHWISE_CONV_LAYER = 'depthwise_conv' EXCONV_LAYER = 'exconv' EXCONVTRANS_LAYER = 'exconvt' CUDNNCONV_LAYER = 'cudnn_conv' @@ -2259,163 +2257,6 @@ def hsigmoid(input, name, LayerType.HSIGMOID, parents=parents, size=l.config.size) -@wrap_name_default("depthwise_conv") -@wrap_param_attr_default() -@wrap_bias_attr_default() -@wrap_act_default(act=ReluActivation()) -@layer_support(DROPOUT) -def img_depthwise_conv_layer(input, - filter_size, - num_filters, - name=None, - num_channels=None, - act=None, - stride=1, - padding=0, - bias_attr=None, - param_attr=None, - shared_biases=True, - layer_attr=None, - filter_size_y=None, - stride_y=None, - padding_y=None, - trans=False, - layer_type=None): - """ - DepthwiseConvolution layer for image. - - The details of depthwise convolution layer, please refer - https://arxiv.org/abs/1704.04861 - - The Depthwise Convolution layer must meet this requirement that the groups equals to the - inputChannels. And the groups must be divisible by outputChannels. - So the filter shape will be (groups, outputChannels/groups, 1, filter_size, filter_size_y) - - The example usage is: - - .. code-block:: python - - conv = img_depthwise_conv_layer(input=data, filter_size=1, filter_size_y=1, - num_channels=8, - num_filters=16, stride=1, - bias_attr=False, - act=ReluActivation()) - - :param name: Layer name. - :type name: basestring - :param input: Layer Input. 
- :type input: LayerOutput - :param filter_size: The x dimension of a filter kernel. Or input a tuple for - two image dimension. - :type filter_size: int|tuple|list - :param filter_size_y: The y dimension of a filter kernel. Since PaddlePaddle - currently supports rectangular filters, the filter's - shape will be (filter_size, filter_size_y). - :type filter_size_y: int|None - :param num_filters: Each filter group's number of filter - :param act: Activation type. Default is tanh - :type act: BaseActivation - :param stride: The x dimension of the stride. Or input a tuple for two image - dimension. - :type stride: int|tuple|list - :param stride_y: The y dimension of the stride. - :type stride_y: int - :param padding: The x dimension of the padding. Or input a tuple for two - image dimension - :type padding: int|tuple|list - :param padding_y: The y dimension of the padding. - :type padding_y: int - :param bias_attr: DepthwiseConvolution bias attribute. None means default bias. - False means no bias. - :type bias_attr: ParameterAttribute|False - :param num_channels: number of input channels. If None will be set - automatically from previous output. - :type num_channels: int - :param param_attr: DepthwiseConvolution param attribute. None means default attribute - :type param_attr: ParameterAttribute - :param shared_biases: Is biases will be shared between filters or not. - :type shared_biases: bool - :param layer_attr: Layer Extra Attribute. - :type layer_attr: ExtraLayerAttribute - :param trans: true if it is a convTransLayer, false if it is a convLayer - :type trans: bool - :param layer_type: specify the layer_type, default is None. If trans=True, - layer_type has to be "exconvt" or "cudnn_convt", - otherwise layer_type has to be either "exconv" or - "cudnn_conv" - :type layer_type: String - :return: LayerOutput object. - :rtype: LayerOutput - """ - - if num_channels is None: - assert input.num_filters is not None - num_channels = input.num_filters - - # the groups in depthwise conv should be equal to input channels. - groups = num_channels - - if filter_size_y is None: - if isinstance(filter_size, collections.Sequence): - assert len(filter_size) == 2 - filter_size, filter_size_y = filter_size - else: - filter_size_y = filter_size - - if stride_y is None: - if isinstance(stride, collections.Sequence): - assert len(stride) == 2 - stride, stride_y = stride - else: - stride_y = stride - - if padding_y is None: - if isinstance(padding, collections.Sequence): - assert len(padding) == 2 - padding, padding_y = padding - else: - padding_y = padding - - if param_attr.attr.get('initial_smart'): - # special initial for conv layers. 
- init_w = (2.0 / (filter_size**2 * num_channels))**0.5 - param_attr.attr["initial_mean"] = 0.0 - param_attr.attr["initial_std"] = init_w - param_attr.attr["initial_strategy"] = 0 - param_attr.attr["initial_smart"] = False - - lt = LayerType.DEPTHWISE_CONV_LAYER - - l = Layer( - name=name, - inputs=Input( - input.name, - conv=Conv( - filter_size=filter_size, - padding=padding, - stride=stride, - channels=num_channels, - groups=groups, - filter_size_y=filter_size_y, - padding_y=padding_y, - stride_y=stride_y), - **param_attr.attr), - active_type=act.name, - num_filters=num_filters, - bias=ParamAttr.to_bias(bias_attr), - shared_biases=shared_biases, - type=lt, - **ExtraLayerAttribute.to_kwargs(layer_attr)) - - return LayerOutput( - name, - lt, - parents=[input], - activation=act, - num_filters=num_filters, - size=l.config.size) - - @wrap_name_default("conv") @wrap_param_attr_default() @wrap_bias_attr_default() From 81998868f0b65b6d73c019a79c3a9e64f54f8f64 Mon Sep 17 00:00:00 2001 From: xzl Date: Wed, 19 Jul 2017 20:09:37 +0800 Subject: [PATCH 035/100] fuse interface of depthwise to expandconv --- paddle/gserver/layers/ConvBaseLayer.cpp | 3 +- paddle/gserver/layers/DepthwiseConvLayer.cpp | 60 -------------------- paddle/gserver/layers/DepthwiseConvLayer.h | 40 ------------- paddle/gserver/layers/ExpandConvLayer.cpp | 20 ++++++- 4 files changed, 18 insertions(+), 105 deletions(-) delete mode 100644 paddle/gserver/layers/DepthwiseConvLayer.cpp delete mode 100644 paddle/gserver/layers/DepthwiseConvLayer.h diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp index 765c627c30..e161d89c38 100644 --- a/paddle/gserver/layers/ConvBaseLayer.cpp +++ b/paddle/gserver/layers/ConvBaseLayer.cpp @@ -21,8 +21,7 @@ bool ConvBaseLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { /* Initialize the basic parent class */ Layer::init(layerMap, parameterMap); - isDeconv_ = (config_.type() == "exconv" || config_.type() == "cudnn_conv" || - config_.type() == "depthwise_conv") + isDeconv_ = (config_.type() == "exconv" || config_.type() == "cudnn_conv") ? false : true; diff --git a/paddle/gserver/layers/DepthwiseConvLayer.cpp b/paddle/gserver/layers/DepthwiseConvLayer.cpp deleted file mode 100644 index 4b5f16d76b..0000000000 --- a/paddle/gserver/layers/DepthwiseConvLayer.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "DepthwiseConvLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(depthwise_conv, DepthwiseConvLayer); - -bool DepthwiseConvLayer::init(const LayerMap &layerMap, - const ParameterMap ¶meterMap) { - /* Initialize the basic convolutional parent class */ - ExpandConvBaseLayer::init(layerMap, parameterMap); - - size_t numInputs = config_.inputs_size(); - inputShape_.resize(numInputs); - filterShape_.resize(numInputs); - outputShape_.resize(numInputs); - - for (int i = 0; i < config_.inputs_size(); i++) { - std::vector paddings = {(size_t)paddingY_[i], (size_t)padding_[i]}; - std::vector strides = {(size_t)strideY_[i], (size_t)stride_[i]}; - createFunction(forward_, - "DepthwiseConv", - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", (size_t)groups_[i])); - - createFunction(backward_, - "DepthwiseConvGradInput", - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", (size_t)groups_[i])); - - createFunction(backward_, - "DepthwiseConvGradFilter", - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", (size_t)groups_[i])); - } - return true; -} - -} // namespace paddle diff --git a/paddle/gserver/layers/DepthwiseConvLayer.h b/paddle/gserver/layers/DepthwiseConvLayer.h deleted file mode 100644 index 1b154bd99d..0000000000 --- a/paddle/gserver/layers/DepthwiseConvLayer.h +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "ExpandConvLayer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief A subclass of convolution layer. - * This layer does the depthwise convolution calculation of mobilenet. - * The config file api is img_depthwise_conv_layer. 
- */ - -class DepthwiseConvLayer : public ExpandConvLayer { -public: - explicit DepthwiseConvLayer(const LayerConfig& config) - : ExpandConvLayer(config) {} - - ~DepthwiseConvLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp index af79e65a7c..224ef0d51b 100644 --- a/paddle/gserver/layers/ExpandConvLayer.cpp +++ b/paddle/gserver/layers/ExpandConvLayer.cpp @@ -38,10 +38,24 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, inputShape_.resize(numInputs); filterShape_.resize(numInputs); outputShape_.resize(numInputs); + + string convType; + string convGradInputType; + string convGradFilterType; + for (int i = 0; i < config_.inputs_size(); i++) { std::vector paddings = {(size_t)paddingY_[i], (size_t)padding_[i]}; std::vector strides = {(size_t)strideY_[i], (size_t)stride_[i]}; + if (useGpu_ && (size_t)groups_[i] == (size_t)channels_[i] && !isDeconv_) { + convType = "DepthwiseConv" convGradInputType = + "DepthwiseConvGradInput" convGradFilterType = + "DepthwiseConvGradFilter" + } else { + convType = "GemmConv" convGradInputType = + "GemmConvGradInput" convGradFilterType = "GemmConvGradFilter" + } + if (FLAGS_use_nnpack) { CHECK_EQ(isDeconv_, false); createFunction(forward_, @@ -53,21 +67,21 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, .set("algo", std::string("auto"))); } else { createFunction(forward_, - !isDeconv_ ? "GemmConv" : "GemmConvGradInput", + !isDeconv_ ? convType : convGradInputType, FuncConfig() .set("paddings", paddings) .set("strides", strides) .set("groups", (size_t)groups_[i])); createFunction(backward_, - !isDeconv_ ? "GemmConvGradInput" : "GemmConv", + !isDeconv_ ? convGradInputType : convType, FuncConfig() .set("paddings", paddings) .set("strides", strides) .set("groups", (size_t)groups_[i])); createFunction(backward_, - "GemmConvGradFilter", + convGradFilterType, FuncConfig() .set("paddings", paddings) .set("strides", strides) From f490d94210d29e8474319c87ed1fb6cacda63693 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 19 Jul 2017 21:51:22 +0800 Subject: [PATCH 036/100] separate MKL_LITE from MKLDNN --- CMakeLists.txt | 11 +++++-- cmake/cblas.cmake | 17 +++++++++- cmake/configure.cmake | 22 +++++++++++-- cmake/external/mkldnn.cmake | 44 ++++++++++---------------- cmake/external/mkllite.cmake | 61 ++++++++++++++++++++++++++++++++++++ 5 files changed, 121 insertions(+), 34 deletions(-) create mode 100644 cmake/external/mkllite.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e664d1415..fedf5db0b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,8 @@ include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) -option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ON) +option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND}) +option(WITH_MKL_LITE "Compile PaddlePaddle with mkl lite package." 
${AVX_FOUND}) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) @@ -76,6 +77,10 @@ if(ANDROID) "Disable PYTHON when cross-compiling for Android" FORCE) set(WITH_RDMA OFF CACHE STRING "Disable RDMA when cross-compiling for Android" FORCE) + set(WITH_MKLDNN OFF CACHE STRING + "Disable MKLDNN when cross-compiling for Android" FORCE) + set(WITH_MKL_LITE OFF CACHE STRING + "Disable MKL lite package when cross-compiling for Android" FORCE) endif(ANDROID) set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING @@ -89,14 +94,15 @@ endif() ######################################################################################## +include(external/mkllite) # download mkl minimal lite package include(external/zlib) # download, build, install zlib include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog include(external/gtest) # download, build, install gtest include(external/protobuf) # download, build, install protobuf include(external/python) # download, build, install python -include(external/mkldnn) # download, build, install mkldnn include(external/openblas) # download, build, install openblas +include(external/mkldnn) # download, build, install mkldnn include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc include(external/any) # download libn::any @@ -139,7 +145,6 @@ if(WITH_GPU) endif(WITH_GPU) if(WITH_MKLDNN) - message(STATUS "MKLDNN_LIBRARY: ${MKLDNN_LIBRARY}") list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKL_LITE_LIB_IOMP}) endif() diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index ee654e64bd..52556b1b40 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -15,7 +15,22 @@ set(CBLAS_FOUND OFF) -## Find MKL First. +## Find MKL Lite First. +if(WITH_MKL_LITE AND MKL_LITE_INC_DIR AND MKL_LITE_LIB) + set(CBLAS_FOUND ON) + set(CBLAS_PROVIDER MKL_LITE) + set(CBLAS_INC_DIR ${MKL_LITE_INC_DIR}) + set(CBLAS_LIBRARIES ${MKL_LITE_LIB}) + + add_definitions(-DPADDLE_USE_MKL_LITE) + add_definitions(-DLAPACK_FOUND) + + message(STATUS "Found cblas and lapack in MKL Lite " + "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + return() +endif() + +## Then find MKL. 
set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs") set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL") diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 8719197682..37eececfd5 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -69,8 +69,26 @@ endif(NOT WITH_GPU) if(WITH_MKLDNN) add_definitions(-DPADDLE_USE_MKLDNN) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + if (WITH_MKL_LITE AND MKLDNN_IOMP_DIR) + message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}") + set(OPENMP_FLAGS "-fopenmp") + set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") + else() + find_package(OpenMP) + if(OPENMP_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + else() + message(WARNING "Can not find OpenMP." + "Some performance features in MKLDNN may not be available") + endif() + endif() + endif(WITH_MKLDNN) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 834f5ae230..28a753e19a 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -23,10 +23,6 @@ SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INCLUDE_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) -# The following magic numbers should be updated regularly to keep latest version -SET(MKLDNN_TAG "v0.9") -SET(MKLDNN_MKL_VER "mklml_lnx_2018.0.20170425") - IF(WIN32) MESSAGE(WARNING "It is not supported compiling with mkldnn in windows Paddle yet." "Force WITH_MKLDNN=OFF") @@ -42,37 +38,29 @@ ENDIF(WIN32) INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR}) -SET(MKLDNN_CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") -SET(MKLDNN_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +IF(${CBLAS_PROVIDER} STREQUAL "MKL_LITE") + SET(MKLDNN_DEPENDS ${MKL_LITE_PROJECT}) + SET(MKLDNN_MKLROOT ${MKL_LITE_ROOT}) + SET(MKLDNN_IOMP_DIR ${MKL_LITE_LIB_DIR}) +ENDIF() ExternalProject_Add( ${MKLDNN_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" - GIT_TAG "${MKLDNN_TAG}" - PREFIX ${MKLDNN_SOURCES_DIR} - PATCH_COMMAND cd /scripts && ./prepare_mkl.sh - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} - CMAKE_ARGS -DCMAKE_INSTALL_LIBDIR=${MKLDNN_INSTALL_DIR}/lib - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR:PATH=${MKLDNN_INSTALL_DIR}/lib - -DCMAKE_BUILD_TYPE:STRING=Release + DEPENDS ${MKLDNN_DEPENDS} + GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" + GIT_TAG "v0.9" + PREFIX ${MKLDNN_SOURCES_DIR} + CONFIGURE_COMMAND mkdir -p /build + BUILD_COMMAND cd /build + && cmake .. 
-DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} -DMKLROOT=${MKLDNN_MKLROOT} + && make all -j${CPU_CORES} + INSTALL_COMMAND cd /build && make install + UPDATE_COMMAND "" ) -SET(MKL_LITE_DIR ${MKLDNN_SOURCES_DIR}/src/${MKLDNN_PROJECT}/external/${MKLDNN_MKL_VER}) -SET(MKL_LITE_INC_DIR ${MKL_LITE_DIR}/include) -SET(MKL_LITE_LIB ${MKL_LITE_DIR}/lib/libmklml_intel.so) -SET(MKL_LITE_LIB_IOMP ${MKL_LITE_DIR}/lib/libiomp5.so) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKL_LITE_DIR}/lib") - ADD_LIBRARY(mkldnn STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIBRARY}) ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) - +MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIBRARY}") LIST(APPEND external_project_dependencies mkldnn) diff --git a/cmake/external/mkllite.cmake b/cmake/external/mkllite.cmake new file mode 100644 index 0000000000..e889290e36 --- /dev/null +++ b/cmake/external/mkllite.cmake @@ -0,0 +1,61 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_MKL_LITE}) + return() +ENDIF(NOT ${WITH_MKL_LITE}) + +INCLUDE(ExternalProject) + +SET(MKL_LITE_PROJECT "extern_mkllite") +SET(MKL_LITE_VER "mklml_lnx_2018.0.20170425") +SET(MKL_LITE_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKL_LITE_VER}.tgz") +SET(MKL_LITE_DOWNLOAD_DIR ${THIRD_PARTY_PATH}/mkllite) + +SET(MKL_LITE_ROOT ${MKL_LITE_DOWNLOAD_DIR}/${MKL_LITE_VER}) +SET(MKL_LITE_INC_DIR ${MKL_LITE_ROOT}/include) +SET(MKL_LITE_LIB_DIR ${MKL_LITE_ROOT}/lib) +SET(MKL_LITE_LIB ${MKL_LITE_LIB_DIR}/libmklml_intel.so) +SET(MKL_LITE_IOMP_LIB ${MKL_LITE_LIB_DIR}/libiomp5.so) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKL_LITE_ROOT}/lib") + +INCLUDE_DIRECTORIES(${MKL_LITE_INC_DIR}) + +ExternalProject_Add( + ${MKL_LITE_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${MKL_LITE_DOWNLOAD_DIR} + DOWNLOAD_DIR ${MKL_LITE_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${MKL_LITE_URL} + && tar -xzf ${MKL_LITE_DOWNLOAD_DIR}/${MKL_LITE_VER}.tgz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + PATCH_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +IF (${CMAKE_VERSION} VERSION_LESS "3.3.0") + SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkllite_dummy.c) + FILE(WRITE ${dummyfile} "const char * dummy_mkllite = \"${dummyfile}\";") + ADD_LIBRARY(mkllite STATIC ${dummyfile}) +ELSE() + ADD_LIBRARY(mkllite INTERFACE) +ENDIF() + +ADD_DEPENDENCIES(mkllite ${MKL_LITE_PROJECT}) + +LIST(APPEND external_project_dependencies mkllite) From 1601c34aa580cceb9ccc2ca4f1a82a9299f0c887 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 19 Jul 2017 22:17:09 +0800 Subject: [PATCH 037/100] fix ld bug when enable WITH_TESTING --- cmake/external/gtest.cmake | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 77e06e983e..8ac68aa325 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -34,9 +34,15 @@ 
IF(WITH_TESTING) "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE) ENDIF(WIN32) + IF(WITH_MKL_LITE) + # wait for mkl downloading completed + SET(GTEST_DEPENDS ${MKL_LITE_PROJECT}) + ENDIF() + ExternalProject_Add( extern_gtest ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${GTEST_DEPENDS} GIT_REPOSITORY "https://github.com/google/googletest.git" GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} From 1f516fa0ef7a29fd79bf92202c553fb41d4a7047 Mon Sep 17 00:00:00 2001 From: xzl Date: Wed, 19 Jul 2017 22:21:27 +0800 Subject: [PATCH 038/100] modify format, and modify the layer grad test, op test --- paddle/function/ConvOpTest.cpp | 383 +++++++++------------- paddle/gserver/layers/ExpandConvLayer.cpp | 17 +- paddle/gserver/tests/test_LayerGrad.cpp | 11 +- 3 files changed, 168 insertions(+), 243 deletions(-) diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp index 27609fbbd4..c96c8d9eea 100644 --- a/paddle/function/ConvOpTest.cpp +++ b/paddle/function/ConvOpTest.cpp @@ -25,95 +25,89 @@ enum TestType { kBackwardFilterTest = 2, }; -enum LayerType { - convolutionType = 0, - depthwiseConvolutionType = 1, -}; - template class ConvolutionTest { public: ConvolutionTest(const std::string& conv1, const std::string& conv2, - LayerType layerType, TestType type, + bool useGroups = true, std::string algo = "auto") { for (size_t batchSize : {1, 32}) { for (size_t inputSize : {7, 14, 54}) { for (size_t filterSize : {1, 3, 5}) { for (size_t inputChannels : {3, 64}) { for (size_t outputChannels : {3, 64, 128}) { - if (inputChannels > outputChannels) break; - if (layerType == depthwiseConvolutionType && - outputChannels % inputChannels != 0) - break; - - size_t groups = 1; - - if (layerType == depthwiseConvolutionType) { - groups = inputChannels; - } - - for (size_t stride : {1, 2}) { - for (size_t padding : {0, 1}) { - if (padding >= filterSize) break; - size_t outputSize = - (inputSize - filterSize + 2 * padding + stride) / stride; - VLOG(3) << " batchSize=" << batchSize - << " inputChannels=" << inputChannels - << " inputHeight=" << inputSize - << " inputWidth=" << inputSize - << " outputChannels=" << outputChannels - << " filterHeight=" << filterSize - << " filterWidth=" << filterSize - << " outputHeight=" << outputSize - << " outputWidth=" << outputSize - << " stride=" << stride << " padding=" << padding; - - std::vector paddings = {padding, padding}; - std::vector strides = {stride, stride}; - Compare2Function test( - conv1, - conv2, - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", groups) - .set("algo", algo)); - - TensorShape input{ - batchSize, inputChannels, inputSize, inputSize}; - - TensorShape filter; - if (layerType == depthwiseConvolutionType) - filter = TensorShape({groups, - outputChannels / groups, - (size_t)1, - filterSize, - filterSize}); - else - filter = TensorShape({outputChannels, - inputChannels, - filterSize, - filterSize}); - TensorShape output{ - batchSize, outputChannels, outputSize, outputSize}; - - if (type == kForwardTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.run(); - } else if (type == kBackwardInputTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); - test.run(); - } else if (type == kBackwardFilterTest) { - 
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.run(); + for (size_t groups : {1, 3, 64}) { + if (inputChannels > outputChannels) break; + if (groups != 1 && + (inputChannels != groups || outputChannels % groups != 0)) + continue; + if (!useGroups) groups = 1; + + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + if (padding >= filterSize) break; + size_t outputSize = + (inputSize - filterSize + 2 * padding + stride) / + stride; + VLOG(3) << " batchSize=" << batchSize + << " inputChannels=" << inputChannels + << " inputHeight=" << inputSize + << " inputWidth=" << inputSize + << " outputChannels=" << outputChannels + << " filterHeight=" << filterSize + << " filterWidth=" << filterSize + << " outputHeight=" << outputSize + << " outputWidth=" << outputSize + << " stride=" << stride << " padding=" << padding; + + std::vector paddings = {padding, padding}; + std::vector strides = {stride, stride}; + Compare2Function test( + conv1, + conv2, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", groups) + .set("algo", algo)); + + TensorShape input{ + batchSize, inputChannels, inputSize, inputSize}; + + TensorShape filter; + if (groups > 1) + filter = TensorShape({groups, + outputChannels / groups, + inputChannels / groups, + filterSize, + filterSize}); + else + filter = TensorShape({outputChannels, + inputChannels, + filterSize, + filterSize}); + TensorShape output{ + batchSize, outputChannels, outputSize, outputSize}; + + if (type == kForwardTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.run(); + } else if (type == kBackwardInputTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), + ADD_TO); + test.run(); + } else if (type == kBackwardFilterTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.run(); + } } } } @@ -132,8 +126,8 @@ class ConvolutionTest2 { public: ConvolutionTest2(const std::string& conv1, const std::string& conv2, - LayerType layerType, TestType type, + bool useGroups = true, std::string algo = "auto") { for (size_t batchSize : {16}) { for (size_t inputHeight : {7, 31}) { @@ -142,78 +136,78 @@ public: for (size_t filterWidth : {3, 7}) { for (size_t inputChannels : {7}) { for (size_t outputChannels : {7, 32}) { - if (layerType == depthwiseConvolutionType && - outputChannels % inputChannels != 0) - break; - - size_t groups = 1; - - if (layerType == depthwiseConvolutionType) { - groups = inputChannels; - } - size_t stride = 1; - size_t padding = 0; - size_t outputHeight = - (inputHeight - filterHeight + 2 * padding + stride) / - stride; - size_t outputWidth = - (inputWidth - filterWidth + 2 * padding + stride) / - stride; - VLOG(3) << " batchSize=" << batchSize - << " inputChannels=" << inputChannels - << " inputHeight=" << inputHeight - << " inputWidth=" << inputWidth - << " outputChannels=" << outputChannels - << " filterHeight=" << filterHeight - << " filterWidth=" << filterWidth - << " outputHeight=" << outputHeight - << " outputWidth=" << outputWidth - << " stride=" << stride << " padding=" << padding; - - std::vector paddings = {padding, 
padding}; - std::vector strides = {stride, stride}; - Compare2Function test( - conv1, - conv2, - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", groups) - .set("algo", algo)); - - TensorShape input{ - batchSize, inputChannels, inputHeight, inputWidth}; - - TensorShape filter; - if (layerType == depthwiseConvolutionType) - filter = TensorShape({groups, - outputChannels / groups, - (size_t)1, - filterHeight, - filterWidth}); - else - filter = TensorShape({outputChannels, - inputChannels, - filterHeight, - filterWidth}); - TensorShape output{ - batchSize, outputChannels, outputHeight, outputWidth}; - - if (type == kForwardTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.run(); - } else if (type == kBackwardInputTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); - test.run(); - } else if (type == kBackwardFilterTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.run(); + for (size_t groups : {1, 7}) { + if (!useGroups && groups != 1 && + (inputChannels != groups || + outputChannels % groups != 0)) + continue; + if (!useGroups) groups = 1; + + size_t stride = 1; + size_t padding = 0; + size_t outputHeight = + (inputHeight - filterHeight + 2 * padding + stride) / + stride; + size_t outputWidth = + (inputWidth - filterWidth + 2 * padding + stride) / + stride; + VLOG(3) << " batchSize=" << batchSize + << " inputChannels=" << inputChannels + << " inputHeight=" << inputHeight + << " inputWidth=" << inputWidth + << " outputChannels=" << outputChannels + << " filterHeight=" << filterHeight + << " filterWidth=" << filterWidth + << " outputHeight=" << outputHeight + << " outputWidth=" << outputWidth + << " stride=" << stride << " padding=" << padding; + + std::vector paddings = {padding, padding}; + std::vector strides = {stride, stride}; + Compare2Function test( + conv1, + conv2, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", groups) + .set("algo", algo)); + + TensorShape input{ + batchSize, inputChannels, inputHeight, inputWidth}; + + TensorShape filter; + if (groups > 1) + filter = TensorShape({groups, + outputChannels / groups, + inputChannels / groups, + filterHeight, + filterWidth}); + else + filter = TensorShape({outputChannels, + inputChannels, + filterHeight, + filterWidth}); + TensorShape output{ + batchSize, outputChannels, outputHeight, outputWidth}; + + if (type == kForwardTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.run(); + } else if (type == kBackwardInputTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), + ADD_TO); + test.run(); + } else if (type == kBackwardFilterTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.run(); + } } } } @@ -225,107 +219,34 @@ public: } }; -// ======Start Convolution TEST====== TEST(Forward, GEMM) { ConvolutionTest test( - 
"NaiveConv-CPU", "GemmConv-CPU", convolutionType, kForwardTest); + "NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false); ConvolutionTest2 test2( - "NaiveConv-CPU", "GemmConv-CPU", convolutionType, kForwardTest); + "NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false); } #ifndef PADDLE_ONLY_CPU TEST(Forward, GEMM2) { ConvolutionTest test( - "GemmConv-CPU", "GemmConv-GPU", convolutionType, kForwardTest); + "GemmConv-CPU", "GemmConv-GPU", kForwardTest); ConvolutionTest2 test2( - "GemmConv-CPU", "GemmConv-GPU", convolutionType, kForwardTest); + "GemmConv-CPU", "GemmConv-GPU", kForwardTest); } TEST(BackwardInput, GEMM) { ConvolutionTest test( - "GemmConvGradInput-CPU", - "GemmConvGradInput-GPU", - convolutionType, - kBackwardInputTest); + "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest); ConvolutionTest2 test2( - "GemmConvGradInput-CPU", - "GemmConvGradInput-GPU", - convolutionType, - kBackwardInputTest); + "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest); } TEST(BackwardFilter, GEMM) { ConvolutionTest test( - "GemmConvGradFilter-CPU", - "GemmConvGradFilter-GPU", - convolutionType, - kBackwardFilterTest); - ConvolutionTest2 test2( - "GemmConvGradFilter-CPU", - "GemmConvGradFilter-GPU", - convolutionType, - kBackwardFilterTest); -} -#endif -// ======End Convolution TEST====== - -// ======Start DepthwiseConvolution TEST====== -// TODO(zhaolong) The depthwise convolution cpu test will be added when the cpu -// version of depthwiseConv is implemented. - -#ifndef PADDLE_ONLY_CPU -TEST(DepthwiseConvForward, GEMM) { - ConvolutionTest test( - "GemmConv-GPU", - "DepthwiseConv-GPU", - depthwiseConvolutionType, - kForwardTest); - ConvolutionTest2 test2( - "GemmConv-GPU", - "DepthwiseConv-GPU", - depthwiseConvolutionType, - kForwardTest); -} - -TEST(DepthwiseConvForward, GEMM2) { - ConvolutionTest test( - "DepthwiseConv-GPU", - "DepthwiseConv-GPU", - depthwiseConvolutionType, - kForwardTest); - ConvolutionTest2 test2( - "DepthwiseConv-GPU", - "DepthwiseConv-GPU", - depthwiseConvolutionType, - kForwardTest); -} - -TEST(DepthwiseConvBackwardInput, GEMM) { - ConvolutionTest test( - "DepthwiseConvGradInput-GPU", - "DepthwiseConvGradInput-GPU", - depthwiseConvolutionType, - kBackwardInputTest); - ConvolutionTest2 test2( - "DepthwiseConvGradInput-GPU", - "DepthwiseConvGradInput-GPU", - depthwiseConvolutionType, - kBackwardInputTest); -} - -TEST(DepthwiseConvBackwardFilter, GEMM) { - ConvolutionTest test( - "DepthwiseConvGradFilter-GPU", - "DepthwiseConvGradFilter-GPU", - depthwiseConvolutionType, - kBackwardFilterTest); + "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest); ConvolutionTest2 test2( - "DepthwiseConvGradFilter-GPU", - "DepthwiseConvGradFilter-GPU", - depthwiseConvolutionType, - kBackwardFilterTest); + "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest); } #endif -// ======End DepthwiseConvolution TEST====== } // namespace paddle diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp index 224ef0d51b..783e02e47c 100644 --- a/paddle/gserver/layers/ExpandConvLayer.cpp +++ b/paddle/gserver/layers/ExpandConvLayer.cpp @@ -39,21 +39,22 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, filterShape_.resize(numInputs); outputShape_.resize(numInputs); - string convType; - string convGradInputType; - string convGradFilterType; + std::string convType; + std::string convGradInputType; + std::string convGradFilterType; for (int i = 0; i < config_.inputs_size(); i++) { std::vector 
paddings = {(size_t)paddingY_[i], (size_t)padding_[i]}; std::vector strides = {(size_t)strideY_[i], (size_t)stride_[i]}; if (useGpu_ && (size_t)groups_[i] == (size_t)channels_[i] && !isDeconv_) { - convType = "DepthwiseConv" convGradInputType = - "DepthwiseConvGradInput" convGradFilterType = - "DepthwiseConvGradFilter" + convType = "DepthwiseConv"; + convGradInputType = "DepthwiseConvGradInput"; + convGradFilterType = "DepthwiseConvGradFilter"; } else { - convType = "GemmConv" convGradInputType = - "GemmConvGradInput" convGradFilterType = "GemmConvGradFilter" + convType = "GemmConv"; + convGradInputType = "GemmConvGradInput"; + convGradFilterType = "GemmConvGradFilter"; } if (FLAGS_use_nnpack) { diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 2f28cec53e..2b45483bcc 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -349,13 +349,13 @@ TEST(Layer, CosSimVecMatLayer) { void testDepthwiseConvLayer(const string& type, bool useGpu) { TestConfig config; - config.biasSize = 16; + config.biasSize = 32; config.layerConfig.set_type(type); - config.layerConfig.set_num_filters(16); + config.layerConfig.set_num_filters(32); config.layerConfig.set_partial_sum(1); config.layerConfig.set_shared_biases(true); - config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192 / 2}); + config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192}); LayerInputConfig* input = config.layerConfig.add_inputs(); ConvConfig* conv = input->mutable_conv_conf(); conv->set_filter_size(2); @@ -388,8 +388,11 @@ void testDepthwiseConvLayer(const string& type, bool useGpu) { } TEST(Layer, depthwiseConvLayer) { + // 'depthwise_conv' is a sepecial case of 'exconv' whose + // groups size equals to the input channels size. + testDepthwiseConvLayer("exconv", /* useGpu= */ false); #ifndef PADDLE_ONLY_CPU - testDepthwiseConvLayer("depthwise_conv", /* useGpu= */ true); + testDepthwiseConvLayer("exconv", /* useGpu= */ true); #endif } From e53a48b46a143217a39b5f1c9125c4a7d507d2b5 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 19 Jul 2017 22:27:41 +0800 Subject: [PATCH 039/100] Add memcpy --- paddle/memory/memory.cc | 46 +++++++++++++++++++++++++++++------------ paddle/memory/memory.h | 17 ++++++++++----- 2 files changed, 45 insertions(+), 18 deletions(-) diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 5be9bef3ac..5c7b3bb15e 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -15,9 +15,6 @@ limitations under the License. 
*/ #include "paddle/memory/memory.h" #include "paddle/memory/detail/buddy_allocator.h" #include "paddle/memory/detail/system_allocator.h" -#include "paddle/platform/assert.h" - -#include namespace paddle { namespace memory { @@ -49,16 +46,9 @@ size_t Used(platform::CPUPlace place) { template <> void Copy(platform::CPUPlace, void* dst, - platform::CPUPlace, void* src, - size_t size) { - memcpy(dst, src, size); -} - -template <> -void Copy(platform::CPUPlace, void* dst, - platform::CPUPlace, void* src, - size_t size) { - memcpy(dst, src, size); + platform::CPUPlace, + const void* src, size_t num) { + memcpy(dst, src, num); } #ifndef PADDLE_ONLY_CPU @@ -93,6 +83,36 @@ size_t Used(platform::GPUPlace place) { return GetGPUBuddyAllocator(place.device)->Used(); } +template <> +void Copy(platform::CPUPlace, void* dst, + platform::GPUPlace, + const void* src, size_t num, + cudaStream_t stream) { + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); +} + +template <> +void Copy(platform::GPUPlace, void* dst, + platform::CPUPlace, + const void* src, size_t num, + cudaStream_t stream) { + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); +} + +template <> +void Copy(platform::GPUPlace dst_place, + void* dst, + platform::GPUPlace src_place, + const void* src, size_t num, + cudaStream_t stream) { + if (dst_place == src_place) { + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); + } else { + platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num, + stream); + } +} + #endif // PADDLE_ONLY_CPU } // namespace memory diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index 96c00cb106..3ac359e174 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -14,22 +14,29 @@ limitations under the License. 
*/
 #pragma once
+#include "paddle/platform/gpu_info.h"
 #include "paddle/platform/place.h"
 namespace paddle {
 namespace memory {
-template
+template
 void* Alloc(Place, size_t);
-template
+template
 void Free(Place, void*);
-template
+template
 size_t Used(Place);
-template
-void Copy(Place1, void* dst, Place2, void* src, size_t size);
+template
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
+
+#ifndef PADDLE_ONLY_CPU
+template
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
+          cudaStream_t stream);
+#endif  // PADDLE_ONLY_CPU
 } // namespace memory
 } // namespace paddle

From b05886414b4f7bdbc9a6228e6cb681c7f3ccb50c Mon Sep 17 00:00:00 2001
From: liaogang
Date: Wed, 19 Jul 2017 22:28:06 +0800
Subject: [PATCH 040/100] Add cuda memcpy in gpu_info

---
 paddle/platform/gpu_info.cc | 20 +++++++++++++++++++-
 paddle/platform/gpu_info.h | 15 ++++++++++++++-
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index a1383d3524..12dc01d1a1 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -44,7 +44,7 @@ void SetDeviceId(int id) {
                  "cudaSetDevice failed in paddle::platform::SetDeviceId");
 }
-void GpuMemoryUsage(size_t& available, size_t& total) {
+void GpuMemoryUsage(size_t &available, size_t &total) {
   throw_on_error(cudaMemGetInfo(&available, &total),
                  "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
 }
@@ -82,5 +82,23 @@ size_t GpuMaxChunkSize() {
   return usable;
 }
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream) {
+  PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream));
+}
+
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind) {
+  PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind));
+  // note: cudaMemcpy may actually be asynchronous with respect to the caller,
+  // block on stream 0 to make sure the copy has completed
+  PADDLE_ENFORCE(cudaStreamSynchronize(0));
+}
+
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
+                   size_t count, cudaStream_t stream) {
+  PADDLE_ENFORCE(
+      cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
+}
 } // namespace platform
 } // namespace paddle

diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index 79e71956bd..d3a5f5f13f 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #ifndef PADDLE_ONLY_CPU
+#include
 #include
 namespace paddle {
@@ -31,7 +32,7 @@ int GetCurrentDeviceId();
 void SetDeviceId(int device_id);
 //!Get the memory usage of current GPU device.
-void GpuMemoryUsage(size_t& available, size_t& total);
+void GpuMemoryUsage(size_t &available, size_t &total);
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
@@ -42,6 +43,18 @@ size_t GpuMinChunkSize();
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
+//! Copy memory from address src to dst asynchronously.
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream);
+
+//! Copy memory from address src to dst synchronously.
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind);
+
+//! Copy memory from one device to another device.
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, + size_t count, cudaStream_t stream); + } // namespace platform } // namespace paddle From 736d078cbf07fc1fc610a90e2bedc7bc57398224 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 19 Jul 2017 22:30:34 +0800 Subject: [PATCH 041/100] replace Tensor::tensor to EigenTensor::From --- paddle/operators/mul_op.cc | 6 +++--- paddle/operators/mul_op.cu | 4 ++-- paddle/operators/mul_op.h | 11 +++++++---- paddle/operators/rowwise_add_op.cc | 4 ++-- paddle/operators/rowwise_add_op.cu | 4 ++-- paddle/operators/rowwise_add_op.h | 11 ++++++----- paddle/operators/sigmoid_op.cc | 4 ++-- paddle/operators/sigmoid_op.cu | 4 ++-- paddle/operators/sigmoid_op.h | 10 ++++++---- paddle/operators/softmax_op.cc | 4 ++-- paddle/operators/softmax_op.cu | 4 ++-- paddle/operators/softmax_op.h | 9 +++++---- 12 files changed, 41 insertions(+), 34 deletions(-) diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 7aa63961a0..fa22478689 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -12,9 +12,9 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include +#include "paddle/operators/mul_op.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/tensor.h" namespace paddle { namespace operators { diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu index 75f00e746c..3ee581dc77 100644 --- a/paddle/operators/mul_op.cu +++ b/paddle/operators/mul_op.cu @@ -12,8 +12,8 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include -#include +#include "paddle/operators/mul_op.h" +#include "paddle/framework/op_registry.h" REGISTER_OP_GPU_KERNEL(mul, paddle::operators::MulKernel -#include +#include "glog/logging.h" +#include "paddle/framework/eigen.h" +#include "paddle/framework/operator.h" namespace paddle { namespace operators { @@ -34,8 +35,10 @@ public: output->mutable_data(context.GetPlace()); - output->matrix().device(*(context.GetEigenDevice())) = - input0.matrix().contract(input1.matrix(), dim_pair); + framework::EigenMatrix::From(*output).device( + *(context.GetEigenDevice())) = + framework::EigenMatrix::From(input0).contract( + framework::EigenMatrix::From(input1), dim_pair); } }; } // namespace operators diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc index 567b058fd0..2590dff7bc 100644 --- a/paddle/operators/rowwise_add_op.cc +++ b/paddle/operators/rowwise_add_op.cc @@ -12,8 +12,8 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include -#include +#include "paddle/operators/rowwise_add_op.h" +#include "paddle/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu index 58fe96a4a3..5dfac4fd2c 100644 --- a/paddle/operators/rowwise_add_op.cu +++ b/paddle/operators/rowwise_add_op.cu @@ -1,5 +1,5 @@ -#include -#include +#include "paddle/framework/op_registry.h" +#include "paddle/operators/rowwise_add_op.h" REGISTER_OP_GPU_KERNEL( rowwise_add, diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h index f1d43002dc..ffe9378404 100644 --- a/paddle/operators/rowwise_add_op.h +++ b/paddle/operators/rowwise_add_op.h @@ -13,8 +13,9 @@ limitations under the License. 
*/ #pragma once -#include -#include +#include "glog/logging.h" +#include "paddle/framework/eigen.h" +#include "paddle/framework/operator.h" namespace paddle { namespace operators { @@ -27,9 +28,9 @@ public: auto in1 = context.Input(1)->Get(); auto* out = context.Output(0)->GetMutable(); - auto input = in0.matrix(); - auto bias = in1.vec(); - auto output = out->matrix(); + auto input = framework::EigenMatrix::From(in0); + auto bias = framework::EigenVector::From(in1); + auto output = framework::EigenMatrix::From(*out); const int bias_size = bias.dimension(0); const int rest_size = input.size() / bias_size; diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index fa13f2c4f7..589b48ce80 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -12,8 +12,8 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include -#include +#include "paddle/operators/sigmoid_op.h" +#include "paddle/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu index 59bba2729f..ed344b2bfd 100644 --- a/paddle/operators/sigmoid_op.cu +++ b/paddle/operators/sigmoid_op.cu @@ -1,5 +1,5 @@ -#include -#include +#include "paddle/operators/sigmoid_op.h" +#include "paddle/framework/op_registry.h" REGISTER_OP_GPU_KERNEL( sigmoid, paddle::operators::SigmoidKernel); diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h index 7995b75297..2b9356246c 100644 --- a/paddle/operators/sigmoid_op.h +++ b/paddle/operators/sigmoid_op.h @@ -14,8 +14,9 @@ #pragma once -#include -#include +#include "glog/logging.h" +#include "paddle/framework/eigen.h" +#include "paddle/framework/operator.h" namespace paddle { namespace operators { @@ -29,8 +30,9 @@ public: output->mutable_data(context.GetPlace()); - output->flat().device(*(context.GetEigenDevice())) = - 1.0 / (1.0 + (-1.0 * input.flat()).exp()); + framework::EigenVector::Flatten(*output).device( + *(context.GetEigenDevice())) = + 1.0 / (1.0 + (-1.0 * framework::EigenVector::Flatten(input)).exp()); } }; } // namespace operators diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 42795adbdc..81bad74865 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -11,8 +11,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
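The kernels touched by this change all follow one pattern: wrap the input and output `Tensor`s in Eigen views via `framework::EigenMatrix::From` or `framework::EigenVector::Flatten`, then assign through the place-specific Eigen device so the same kernel body runs on CPU and GPU. The sketch below applies that pattern to a hypothetical element-wise scaling kernel; the class name, the scaling operation and the exact template signatures are assumptions modelled on the sigmoid kernel above, not repository code.

```cpp
#include "glog/logging.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/operator.h"

namespace paddle {
namespace operators {

// Hypothetical kernel that doubles its input, used only to illustrate the
// EigenVector::Flatten + GetEigenDevice<Place>() idiom.
template <typename Place>
class ScaleByTwoKernel : public framework::OpKernel {
public:
  void Compute(const framework::KernelContext& context) const override {
    auto input = context.Input(0)->Get<framework::Tensor>();
    auto* output = context.Output(0)->GetMutable<framework::Tensor>();
    output->mutable_data<float>(context.GetPlace());

    // The device object selects where the Eigen expression is evaluated.
    framework::EigenVector<float>::Flatten(*output).device(
        *(context.GetEigenDevice<Place>())) =
        framework::EigenVector<float>::Flatten(input) * 2.0f;
  }
};

}  // namespace operators
}  // namespace paddle
```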
*/ -#include -#include +#include "paddle/operators/softmax_op.h" +#include "paddle/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu index 730c76a04b..60676191eb 100644 --- a/paddle/operators/softmax_op.cu +++ b/paddle/operators/softmax_op.cu @@ -1,5 +1,5 @@ -#include -#include +#include "paddle/framework/op_registry.h" +#include "paddle/operators/softmax_op.h" REGISTER_OP_GPU_KERNEL( softmax, paddle::operators::SoftmaxKernel); diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index 34a6c299bb..53c626a792 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -14,8 +14,9 @@ #pragma once -#include -#include +#include "glog/logging.h" +#include "paddle/framework/eigen.h" +#include "paddle/framework/operator.h" namespace paddle { namespace operators { @@ -27,8 +28,8 @@ public: auto input = context.Input(0)->Get(); auto* output = context.Output(0)->GetMutable(); - auto logits = input.matrix(); - auto softmax = output->matrix(); + auto logits = framework::EigenMatrix::From(input); + auto softmax = framework::EigenMatrix::From(*output); const int kBatchDim = 0; const int kClassDim = 1; From a6347daa0578d403cc2cdc3169d68bed86bac4d1 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 19 Jul 2017 22:48:24 +0800 Subject: [PATCH 042/100] fix clang compile warning --- paddle/operators/mul_op.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 7bd1f7e759..e6bad7fb9d 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -26,9 +26,7 @@ class MulKernel : public framework::OpKernel { public: void Compute(const framework::KernelContext& context) const override { Eigen::array, 1> dim_pair = { - Eigen::IndexPair(1, 0)}; - // dim_pair[0].first = 1; - // dim_pair[0].second = 0; + {Eigen::IndexPair(1, 0)}}; auto input0 = context.Input(0)->Get(); auto input1 = context.Input(1)->Get(); From 527c85970bcc16fae2598f984baa6d648fde4052 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 19 Jul 2017 22:51:27 +0800 Subject: [PATCH 043/100] Fix H2D and D2H order --- paddle/memory/memory.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 5c7b3bb15e..67d2ae1bbd 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -88,7 +88,7 @@ void Copy(platform::CPUPlace, void* dst, platform::GPUPlace, const void* src, size_t num, cudaStream_t stream) { - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); } template <> @@ -96,7 +96,7 @@ void Copy(platform::GPUPlace, void* dst, platform::CPUPlace, const void* src, size_t num, cudaStream_t stream) { - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); } template <> From bd54eb98346974b5c289f7c0f4861e4d373c79bb Mon Sep 17 00:00:00 2001 From: xzl Date: Wed, 19 Jul 2017 23:56:23 +0800 Subject: [PATCH 044/100] tiny modify the test --- paddle/function/ConvOpTest.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp index c96c8d9eea..f0c45c97b1 100644 --- a/paddle/function/ConvOpTest.cpp +++ b/paddle/function/ConvOpTest.cpp @@ -37,7 +37,7 @@ public: for (size_t inputSize : {7, 14, 54}) { for 
(size_t filterSize : {1, 3, 5}) { for (size_t inputChannels : {3, 64}) { - for (size_t outputChannels : {3, 64, 128}) { + for (size_t outputChannels : {3, 64}) { for (size_t groups : {1, 3, 64}) { if (inputChannels > outputChannels) break; if (groups != 1 && @@ -135,11 +135,10 @@ public: for (size_t filterHeight : {1, 5}) { for (size_t filterWidth : {3, 7}) { for (size_t inputChannels : {7}) { - for (size_t outputChannels : {7, 32}) { + for (size_t outputChannels : {7}) { for (size_t groups : {1, 7}) { - if (!useGroups && groups != 1 && - (inputChannels != groups || - outputChannels % groups != 0)) + if (groups != 1 && (inputChannels != groups || + outputChannels % groups != 0)) continue; if (!useGroups) groups = 1; From 4876f358580a1d09e14a5c29a6abbfee4bc8aae4 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Thu, 20 Jul 2017 00:18:39 +0800 Subject: [PATCH 045/100] "make plainNet shared" --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/net.cc | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 74937b2b71..d018ee50c0 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -29,4 +29,4 @@ add_dependencies(framework_py_proto framework_py_proto_init) proto_library(net_proto SRCS net_proto.proto DEPS op_proto) # cc_library(net SRCS net.cc DEPS operator net_proto op_registry fc_op) cc_library(net SRCS net.cc DEPS operator net_proto op_registry) -cc_test(net_op_test SRCS net_op_test.cc DEPS net) +cc_test(net_op_test SRCS net_op_test.cc DEPS net my_fc_op) diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index 2abc5d3417..bb02dcbcee 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -21,10 +21,9 @@ namespace paddle { namespace framework { std::shared_ptr AddBackwardOp(std::shared_ptr ForwardOps) { - // NetPtr->reset(new PlainNet); - // NetPtr grad_ops = new PlainNet; - std::shared_ptr grad_ops; - grad_ops.reset(new PlainNet); + auto grad_ops = std::make_shared(); + // std::shared_ptr grad_ops; + // grad_ops.reset(new PlainNet); for (auto& op : ForwardOps->ops_) { auto op_grad = OpRegistry::CreateGradOp(op); grad_ops->AddOp(op_grad); From e192d0fd017c14e8d8366a6451870d3ed0085dee Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 20 Jul 2017 02:14:31 +0800 Subject: [PATCH 046/100] Refactor the implementation of gradient Op creating --- paddle/framework/grad_op_creator.cc | 97 +++++++++++++++++++++++ paddle/framework/grad_op_creator.h | 46 +++++++++++ paddle/framework/op_registry.h | 118 +++------------------------- 3 files changed, 152 insertions(+), 109 deletions(-) create mode 100644 paddle/framework/grad_op_creator.cc create mode 100644 paddle/framework/grad_op_creator.h diff --git a/paddle/framework/grad_op_creator.cc b/paddle/framework/grad_op_creator.cc new file mode 100644 index 0000000000..dbc10d5ad5 --- /dev/null +++ b/paddle/framework/grad_op_creator.cc @@ -0,0 +1,97 @@ +#include "paddle/framework/grad_op_creator.h" + +namespace paddle { +namespace framework { + +OperatorBase* GradOpCreator::Create() { + BuildOpInOutArgList(); + OperatorBase* grad_op = OpRegistry::grad_creators().at(op_->type_)(); + CompleteGradOp(grad_op); + return grad_op; +} + +OpInOutArg* GradOpCreator::BuildArg(const VarProto& var, + const VarIndexMap& var_map, + const vector& format, InOutType type) { + int idx = var_map.at(var.name()); + int begin_idx = format.empty() ? idx : format.at(idx); + int end_idx = format.empty() ? 
idx + 1 : format.at(idx + 1); + return new OpInOutArg(var.name(), type, !var.ignore_gradient(), begin_idx, + end_idx); +} + +void GradOpCreator::BuildOpInOutArgList() { + const OpProto& op_proto = OpRegistry::protos().at(op_->type); + const auto& var_map = *(OpRegistry::VarIndexMaps().at(op->type_)); + const vector& in_format = + op_->attrs_.count("input_format") + ? op->GetAttr>("input_format") + : std::vector(); + const vector& out_format = + op_->attrs_.count("output_format") + ? op->GetAttr>("output_format") + : std::vector(); + for (const auto& var : op_proto.inputs()) { + arg_list_.emplace_back( + std::shared_ptr(BuildArg(var, var_map, in_format, IN))); + } + for (const auto& var : op_proto.outputs()) { + arg_list_.emplace_back( + std::shared_ptr(BuildArg(var, var_map, out_format, OUT))); + } +} + +void GradOpCreator::PushArgIntoGradOp(const OpInOutArg* arg, + vector& in_out, + vector& format, VarIndexMap* varmap, + int& idx, bool is_grad) { + std::string var_name = arg->proto_name_; + if (is_grad) { + var_name += OperatorBase::GRAD_VAR_SUFFIX(); + } + *(varmap)[var_name] = idx++; + size_t pre_sz = in_out.size(); + auto base_it = arg->type == IN ? op_->inputs_.begin() : op_->outputs_.begin(); + std::copy(base_it + arg->begin_idx_, base_it + arg->end_idx_, + std::back_inserter(in_out)); + if (is_grad) { + for (size_t i = pre_sz; i < in_out.size(); ++i) { + in_out[i] += OperatorBase::GRAD_VAR_SUFFIX(); + } + } + format.push_back(in_out.size()); +} + +void GradOpCreator::CompleteGradOp(OperatorBase* grad_op) const { + grad_op->type_ = op_->type_ + "@GRAD"; // not necessary + grad_op->attrs_ = op_->attrs_; + grad_op->attrs_.erase("input_format"); + grad_op->attrs_.erase("output_format"); + VarIndexMap* grad_varmap = new VarIndexMap(); + int in_idx = 0; + int out_idx = 0; + vector in_format({0}); + vector out_format({0}); + for (const auto& arg : arg_list_) { + // op_'s inputs_ and outputs_ + if (arg->needed_in_grad_) { + PushArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap, + in_idx, false); + } + if (arg->type_ == IN) { + // gradients of op_'s inputs_ + PushArgIntoGradOp(arg.get(), grad_op->outputs_, out_format, grad_varmap, + out_idx, true); + } else { + // gradients of op_'s outputs_ + PushArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap, + in_idx, true); + } + } + grad_op->attrs_["input_format"] = in_format; + grad_op->attrs_["output_format"] = out_format; + grad_op->in_out_idxs_.reset(grad_varmap); +} + +} // namespace framework +} // namespace paddle \ No newline at end of file diff --git a/paddle/framework/grad_op_creator.h b/paddle/framework/grad_op_creator.h new file mode 100644 index 0000000000..441aae4979 --- /dev/null +++ b/paddle/framework/grad_op_creator.h @@ -0,0 +1,46 @@ +#pragma once + +#include "paddle/framework/op_proto.pb.h" +#include "paddle/framework/operator.h" + +namespace paddle { +namespace framework { +class OpRegistry; + +class GradOpCreator { + public: + GradOpCreator(const OperatorBase* op) : op_(op) {} + OperatorBase* Create(); + + private: + enum InOutType { IN, OUT }; + + struct OpInOutArg { + OpInOutArg(const std::string& proto_name, const InOutType& type, + bool needed_in_grad, size_t begin_idx, size_t end_idx) + : proto_name_(proto_name), + type_(type), + needed_in_grad_(needed_in_grad), + begin_idx_(begin_idx), + end_idx_(end_idx) {} + + std::string proto_name_; + InOutType type_; + bool needed_in_grad_; + size_t begin_idx_; + size_t end_idx_; + }; + + OpInOutArg* BuildArg(const VarProto& var, const VarIndexMap& 
var_map, + const vector& format, InOutType type); + void BuildOpInOutArgList(); + void PushArgIntoGradOp(const OpInOutArg* arg, vector& in_out, + vector& format, VarIndexMap* varmap, int& idx, + bool is_grad); + void CompleteGradOp(OperatorBase* grad_op) const; + const OperatorBase* op_; + std::vector> arg_list_; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 4a197102d6..fcb529bbac 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -6,9 +6,8 @@ #include #include #include "paddle/framework/attr_checker.h" +#include "paddle/framework/grad_op_creater.h" #include "paddle/framework/op_desc.pb.h" -#include "paddle/framework/op_proto.pb.h" -#include "paddle/framework/operator.h" #include "paddle/framework/scope.h" namespace paddle { @@ -286,13 +285,8 @@ class OpRegistry { } static OperatorPtr CreateGradOp(OperatorPtr op) { - OperatorPtr grad_op(grad_creators().at(op->type_)()); - grad_op->type_ = op->type_; - - AssembleGradInOut(op, grad_op); - GenerateGradArgOffset(op, grad_op); - GenerateGradAttr(op, grad_op); - + GradOpCreator creator(op.get()); + OperatorPtr grad_op(creator.Create()); grad_op->Init(); return grad_op; } @@ -302,13 +296,18 @@ class OpRegistry { return protos_; }; - private: + static std::unordered_map& grad_creators() { + static std::unordered_map grad_creators_; + return grad_creators_; + } + static std::unordered_map>& VarIndexMaps() { static std::unordered_map> maps_; return maps_; } + private: static std::unordered_map& creators() { static std::unordered_map creators_; return creators_; @@ -319,11 +318,6 @@ class OpRegistry { return op_checkers_; }; - static std::unordered_map& grad_creators() { - static std::unordered_map grad_creators_; - return grad_creators_; - } - static void GenerateTempVariableName(OperatorBase* op) { static std::atomic gUniqId(0UL); for (auto& outname : op->outputs_) { @@ -334,100 +328,6 @@ class OpRegistry { } } } - - static void AssembleGradInOut(OperatorPtr op, OperatorPtr grad_op) { - size_t in_sz = op->inputs_.size() + op->outputs_.size() * 2; - grad_op->inputs_.reserve(in_sz); - size_t out_sz = op->inputs_.size(); - grad_op->outputs_.reserve(out_sz); - // copy op->inputs_ to grad_op->inputs_ - std::copy(op->inputs_.begin(), op->inputs_.end(), - std::back_inserter(grad_op->inputs_)); - // copy op->outputs_ to grad_op->inputs_ - std::copy(op->outputs_.begin(), op->outputs_.end(), - std::back_inserter(grad_op->inputs_)); - // add gradients of op->outputs_ to grad_op->inputs_ - for (const std::string& name : op->outputs_) { - grad_op->inputs_.emplace_back(name + OperatorBase::GRAD_VAR_SUFFIX()); - } - // add gradients of op->inputs_ to grad_op->outputs_ - for (const std::string& name : op->inputs_) { - grad_op->outputs_.emplace_back(name + OperatorBase::GRAD_VAR_SUFFIX()); - } - } - - static void GenerateGradArgOffset(OperatorPtr op, OperatorPtr grad_op) { - VarIndexMap* grad_varmap = new VarIndexMap(); - const OpProto& op_proto = protos()[op->type_]; - int idx = 0; - // offset of op's inputs - for (const auto& var : op_proto.inputs()) { - (*grad_varmap)[var.name()] = idx++; - } - // offset of op's outputs - for (const auto& var : op_proto.outputs()) { - (*grad_varmap)[var.name()] = idx++; - } - // offset of gradients of op's output - for (const auto& var : op_proto.outputs()) { - (*grad_varmap)[var.name() + OperatorBase::GRAD_VAR_SUFFIX()] = idx++; - } - idx = 0; - // offset of gradients of op's input - for (const auto& var : 
op_proto.inputs()) { - (*grad_varmap)[var.name() + OperatorBase::GRAD_VAR_SUFFIX()] = idx++; - } - grad_op->in_out_idxs_.reset(grad_varmap); - } - - static void GenerateGradAttr(OperatorPtr op, OperatorPtr grad_op) { - const OpProto& op_proto = protos()[op->type_]; - grad_op->attrs_ = op->attrs_; - grad_op->attrs_.erase("input_format"); - grad_op->attrs_.erase("output_format"); - bool has_in_format = op->attrs_.count("input_format"); - bool has_out_format = op->attrs_.count("output_format"); - // grad_op's inputs_ contains op's inputs_, outputs_ and gradients of - // outpus_. So grad_op's input_format is necessary when op has - // either input_format or output_format. - if (has_in_format || has_out_format) { - std::vector old_in_format; - std::vector old_out_format; - has_in_format - ? old_in_format = op->GetAttr>("input_format") - : old_in_format = std::vector(op_proto.inputs_size()), - std::iota(old_in_format.begin(), old_in_format.end(), 0); - has_out_format - ? old_out_format = op->GetAttr>("output_format") - : old_out_format = std::vector(op_proto.outputs_size()), - std::iota(old_out_format.begin(), old_out_format.end(), 0); - - std::vector in_format; - in_format.reserve(old_in_format.size() + old_out_format.size() * 2); - int base = 0; - for (const int& idx : old_in_format) { - in_format.emplace_back(idx + base); - } - base += op->inputs_.size(); - for (const int& idx : old_out_format) { - in_format.emplace_back(idx + base); - } - base += op->outputs_.size(); - for (const int& idx : old_in_format) { - in_format.emplace_back(idx + base); - } - grad_op->attrs_["input_format"] = in_format; - // grad_op's outputs_ contains gradients of op's inputs_. So grad_op's - // output_format is necessary only when op has input_format. - if (has_in_format) { - std::vector out_format; - out_format.reserve(op_proto.inputs_size()); - std::copy(old_in_format.begin(), old_in_format.end(), - std::back_inserter(out_format)); - grad_op->attrs_["output_format"] = out_format; - } - } - } }; template From 00500eeb7dcf388261d3145e0ac521d0b1e10dc2 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 20 Jul 2017 09:40:27 +0800 Subject: [PATCH 047/100] Add stdlib.h for memcpy --- paddle/memory/memory.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 67d2ae1bbd..a7d7fa0bfe 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -16,6 +16,8 @@ limitations under the License. 
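The `begin_idx_`/`end_idx_` fields that `BuildArg` computes, and the `input_format`/`output_format` attributes that the removed registry helpers above maintained by hand, encode a single convention: a flat list of operand names plus a vector of offsets with a trailing sentinel, so declared input `i` owns the slice `[format[i], format[i+1])`. A stand-alone illustration with made-up variable names:

```cpp
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Flat operand list of a hypothetical op with two declared inputs:
  // the first spans {"x0", "x1"}, the second is just {"w"}.
  std::vector<std::string> inputs = {"x0", "x1", "w"};
  // input_format[i] is where declared input i starts; the last entry is the
  // total size, so input i spans [input_format[i], input_format[i + 1]).
  std::vector<int> input_format = {0, 2, 3};

  for (size_t i = 0; i + 1 < input_format.size(); ++i) {
    std::cout << "input " << i << ":";
    for (int j = input_format[i]; j < input_format[i + 1]; ++j) {
      std::cout << " " << inputs[j];
    }
    std::cout << "\n";
  }
  return 0;
}
```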
*/ #include "paddle/memory/detail/buddy_allocator.h" #include "paddle/memory/detail/system_allocator.h" +#include // for memcpy + namespace paddle { namespace memory { From 14424f314c4d2018b49ad242c82738a21d2fe9e3 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Thu, 20 Jul 2017 10:03:53 +0800 Subject: [PATCH 048/100] "use built-in operator" --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/net_op_test.cc | 19 +++++++++++++------ paddle/framework/op_registry.h | 6 +++--- paddle/operators/add_op.cc | 15 +++++++++++++++ paddle/operators/add_op_test.cc | 7 ++++++- paddle/operators/mul_op.cc | 13 +++++++++++++ paddle/operators/sigmoid_op.cc | 13 +++++++++++++ 7 files changed, 64 insertions(+), 11 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index d018ee50c0..b56107daf1 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -29,4 +29,4 @@ add_dependencies(framework_py_proto framework_py_proto_init) proto_library(net_proto SRCS net_proto.proto DEPS op_proto) # cc_library(net SRCS net.cc DEPS operator net_proto op_registry fc_op) cc_library(net SRCS net.cc DEPS operator net_proto op_registry) -cc_test(net_op_test SRCS net_op_test.cc DEPS net my_fc_op) +cc_test(net_op_test SRCS net_op_test.cc DEPS net add_op mul_op sigmoid_op) diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc index 18151c56d9..2e74235261 100644 --- a/paddle/framework/net_op_test.cc +++ b/paddle/framework/net_op_test.cc @@ -2,7 +2,10 @@ #include #include #include -#include "paddle/framework/fully_connected_op.h" + +USE_OP(add_two); +USE_OP(mul); +USE_OP(sigmoid); namespace paddle { namespace framework { @@ -65,14 +68,18 @@ TEST(OpKernel, all) { ASSERT_THROW(net->AddOp(op2), EnforceNotMet); } - TEST(AddBackwardOp, TestGradOp) { auto net = std::make_shared(); ASSERT_NE(net, nullptr); - auto op1 = std::make_shared(); - op1->inputs_ = {"x", "w1", "b1"}; - op1->outputs_ = {"y"}; - net->AddOp(op1); + net->AddOp(framework::OpRegistry::CreateOp("mul", {"X", "Y"}, {"Out"}, {})); + net->AddOp( + framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {"Out"}, {})); + net->AddOp(framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {""}, {})); + // net->AddOp(framework::OpRegistry::CreateOp("fc"), { + // Input("X"), Input("W"), Input("b")}, + // {Output("Y")}, + // {} + // ); auto grad_ops = AddBackwardOp(net); for (auto& op : grad_ops->ops_) { op->DebugString(); diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 92354f4ffd..07c3399462 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -470,11 +470,11 @@ class GradOpRegisterHelper { */ #define REGISTER_GRADIENT_OP(__op_type, __op_class) \ STATIC_ASSERT_GLOBAL_NAMESPACE( \ - __reg_op__##__op_type, \ + __reg_gradient_op__##__op_type, \ "REGISTER_GRADIENT_OP must be in global namespace"); \ static ::paddle::framework::GradOpRegisterHelper<__op_class> \ - __op_register_##__op_type##__(#__op_type); \ - int __op_register_##__op_type##_handle__() { return 0; } + __op_gradient_register_##__op_type##__(#__op_type); \ + int __op_gradient_register_##__op_type##_handle__() { return 0; } /** * Macro to Register OperatorKernel. 
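Putting the registration pieces together: an operator class, its proto maker and its gradient class are registered under one type name, after which both the forward op and its gradient op can be created by that name, as the net tests above do. The sketch below uses a hypothetical "cos" operator; `CosineOp`, `CosineOpMaker` and `CosineOpGrad` are made up, and the `InferShape` signature is assumed from the grad-op classes in this patch series.

```cpp
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

class CosineOp : public framework::OperatorWithKernel {
protected:
  void InferShape(
      const std::vector<const framework::Tensor *> &inputs,
      const std::vector<framework::Tensor *> &outputs) const override {}
};

class CosineOpMaker : public framework::OpProtoAndCheckerMaker {
public:
  CosineOpMaker(framework::OpProto *proto,
                framework::OpAttrChecker *op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "input of cosine op");
    AddOutput("Out", "output of cosine op");
    AddComment("Cosine operator, used only to illustrate registration.");
  }
};

class CosineOpGrad : public framework::OperatorWithKernel {
protected:
  void InferShape(
      const std::vector<const framework::Tensor *> &inputs,
      const std::vector<framework::Tensor *> &outputs) const override {}
};

}  // namespace operators
}  // namespace paddle

REGISTER_OP(cos, paddle::operators::CosineOp, paddle::operators::CosineOpMaker);
REGISTER_GRADIENT_OP(cos, paddle::operators::CosineOpGrad);

// Client code (e.g. a test linked with USE_OP(cos)) can then build both ops
// by type name:
//   auto op = paddle::framework::OpRegistry::CreateOp("cos", {"X"}, {"Out"}, {});
//   auto grad_op = paddle::framework::OpRegistry::CreateGradOp(op);
```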
diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc index 41d044cdb7..f59a027407 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_op.cc @@ -49,10 +49,25 @@ The equation is: Out = X + Y )DOC"); } }; + +class AddOpGrad : public framework::OperatorWithKernel { +protected: + void InferShape( + const std::vector &inputs, + const std::vector &outputs) const override {} + std::string DebugString() const override { + LOG(INFO) << "AddOpGrad"; + return ""; + } +}; + } // namespace operators } // namespace paddle REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker); +REGISTER_GRADIENT_OP(add_two, paddle::operators::AddOpGrad); + typedef paddle::operators::AddKernel<::paddle::platform::CPUPlace, float> AddKernel_CPU_float; REGISTER_OP_CPU_KERNEL(add_two, AddKernel_CPU_float); +// REGISTER_OP_CPU_KERNEL(add_two, AddKernel_CPU_float); diff --git a/paddle/operators/add_op_test.cc b/paddle/operators/add_op_test.cc index 53b354fedc..7fc1049893 100644 --- a/paddle/operators/add_op_test.cc +++ b/paddle/operators/add_op_test.cc @@ -16,8 +16,13 @@ limitations under the License. */ #define private public #include USE_OP(add_two); +// USE_OP(add_two_grad); + TEST(AddOp, GetOpProto) { auto& protos = paddle::framework::OpRegistry::protos(); auto it = protos.find("add_two"); ASSERT_NE(it, protos.end()); -} \ No newline at end of file + auto& grad_creators = paddle::framework::OpRegistry::grad_creators(); + auto it1 = grad_creators.find("add_two"); + ASSERT_NE(it1, grad_creators.end()); +} diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 713b2a5dc8..ebf345194c 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -52,9 +52,22 @@ The equation is: Out = X * Y } }; +class MulOpGrad : public framework::OperatorWithKernel { +protected: + void InferShape( + const std::vector &inputs, + const std::vector &outputs) const override {} + std::string DebugString() const override { + LOG(INFO) << "MulGrad"; + return ""; + } +}; + } // namespace operators } // namespace paddle REGISTER_OP(mul, paddle::operators::MulOp, paddle::operators::MulOpMaker); +REGISTER_GRADIENT_OP(mul, paddle::operators::MulOpGrad); + REGISTER_OP_CPU_KERNEL( mul, paddle::operators::MulKernel); diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index 45ae277c53..16348db020 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -39,11 +39,24 @@ public: } }; +class SigmoidOpGrad : public framework::OperatorWithKernel { +protected: + void InferShape( + const std::vector &inputs, + const std::vector &outputs) const override {} + std::string DebugString() const override { + LOG(INFO) << "SigmoidGrad"; + return ""; + } +}; + } // namespace operators } // namespace paddle REGISTER_OP(sigmoid, paddle::operators::SigmoidOp, paddle::operators::SigmoidOpMaker); +REGISTER_GRADIENT_OP(sigmoid, paddle::operators::SigmoidOpGrad); + REGISTER_OP_CPU_KERNEL( sigmoid, paddle::operators::SigmoidKernel); From 0897d18a0a66b7942d8cecc7c8652192f47df66c Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 20 Jul 2017 10:05:54 +0800 Subject: [PATCH 049/100] Fix string.h for memcpy --- paddle/memory/memory.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index a7d7fa0bfe..4056a54b4a 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/memory/detail/buddy_allocator.h" #include "paddle/memory/detail/system_allocator.h" -#include // for memcpy +#include // for memcpy namespace paddle { namespace memory { @@ -50,7 +50,7 @@ template <> void Copy(platform::CPUPlace, void* dst, platform::CPUPlace, const void* src, size_t num) { - memcpy(dst, src, num); + std::memcpy(dst, src, num); } #ifndef PADDLE_ONLY_CPU From 81a352aff7bc6588b652023b290986ba02301df9 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Thu, 20 Jul 2017 10:31:44 +0800 Subject: [PATCH 050/100] "test fc without gradient" --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/net.cc | 2 -- paddle/framework/net_op_test.cc | 16 +++++++++++----- paddle/framework/op_registry.h | 8 +++++++- paddle/operators/softmax_op.cc | 13 +++++++++++++ 5 files changed, 32 insertions(+), 9 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index b56107daf1..5eec31197f 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -29,4 +29,4 @@ add_dependencies(framework_py_proto framework_py_proto_init) proto_library(net_proto SRCS net_proto.proto DEPS op_proto) # cc_library(net SRCS net.cc DEPS operator net_proto op_registry fc_op) cc_library(net SRCS net.cc DEPS operator net_proto op_registry) -cc_test(net_op_test SRCS net_op_test.cc DEPS net add_op mul_op sigmoid_op) +cc_test(net_op_test SRCS net_op_test.cc DEPS net add_op mul_op sigmoid_op softmax_op fc_op) diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index bb02dcbcee..8902e2bcf1 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -22,8 +22,6 @@ namespace framework { std::shared_ptr AddBackwardOp(std::shared_ptr ForwardOps) { auto grad_ops = std::make_shared(); - // std::shared_ptr grad_ops; - // grad_ops.reset(new PlainNet); for (auto& op : ForwardOps->ops_) { auto op_grad = OpRegistry::CreateGradOp(op); grad_ops->AddOp(op_grad); diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc index 2e74235261..2f24816bf8 100644 --- a/paddle/framework/net_op_test.cc +++ b/paddle/framework/net_op_test.cc @@ -6,6 +6,7 @@ USE_OP(add_two); USE_OP(mul); USE_OP(sigmoid); +USE_OP(softmax); namespace paddle { namespace framework { @@ -75,16 +76,21 @@ TEST(AddBackwardOp, TestGradOp) { net->AddOp( framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {"Out"}, {})); net->AddOp(framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {""}, {})); - // net->AddOp(framework::OpRegistry::CreateOp("fc"), { - // Input("X"), Input("W"), Input("b")}, - // {Output("Y")}, - // {} - // ); auto grad_ops = AddBackwardOp(net); for (auto& op : grad_ops->ops_) { op->DebugString(); } } +// TODO(zhihong): add fc grad without registering. 
+// TEST(AddBackwardOp, TestNoGradOp) { +// auto net = std::make_shared(); +// ASSERT_NE(net, nullptr); +// net->AddOp(framework::OpRegistry::CreateOp("fc", {"X", "W", "b"}, {"Y"}, +// {})); auto grad_ops = AddBackwardOp(net); for (auto& op : grad_ops->ops_) { +// op->DebugString(); +// } +// } + } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 07c3399462..0aa1eca837 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -286,7 +286,13 @@ class OpRegistry { } static OperatorPtr CreateGradOp(OperatorPtr op) { - OperatorPtr grad_op(grad_creators().at(op->type_)()); + auto it = grad_creators().find(op->type_); + if (it == grad_creators().end()) { + LOG(INFO) << op->type_ << "does not has gradient op"; + return nullptr; + } + // OperatorPtr grad_op(grad_creators().at(op->type_)()); + OperatorPtr grad_op(it->second()); grad_op->type_ = op->type_; AssembleGradInOut(op, grad_op); diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 4ca7be359e..146326d283 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -40,10 +40,23 @@ public: } }; +class SoftmaxOpGrad : public framework::OperatorWithKernel { +protected: + void InferShape( + const std::vector &inputs, + const std::vector &outputs) const override {} + std::string DebugString() const override { + LOG(INFO) << "SoftmaxOpGrad"; + return ""; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker); +REGISTER_GRADIENT_OP(softmax, paddle::operators::SoftmaxOpGrad); + REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel); From 8a5ee462a53f7a59edd9dbc5388a60cfab7c1e99 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 20 Jul 2017 10:59:16 +0800 Subject: [PATCH 051/100] Fix some compile errors --- paddle/framework/grad_op_creator.h | 12 +++++++----- paddle/framework/op_registry.h | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/paddle/framework/grad_op_creator.h b/paddle/framework/grad_op_creator.h index 441aae4979..456b066f1d 100644 --- a/paddle/framework/grad_op_creator.h +++ b/paddle/framework/grad_op_creator.h @@ -8,6 +8,8 @@ namespace framework { class OpRegistry; class GradOpCreator { + using VarIndexMap = std::unordered_map; + public: GradOpCreator(const OperatorBase* op) : op_(op) {} OperatorBase* Create(); @@ -32,15 +34,15 @@ class GradOpCreator { }; OpInOutArg* BuildArg(const VarProto& var, const VarIndexMap& var_map, - const vector& format, InOutType type); + const std::vector& format, InOutType type); void BuildOpInOutArgList(); - void PushArgIntoGradOp(const OpInOutArg* arg, vector& in_out, - vector& format, VarIndexMap* varmap, int& idx, - bool is_grad); + void AddArgIntoGradOp(const OpInOutArg* arg, std::vector& in_out, + std::vector& format, VarIndexMap* varmap, int& idx, + bool is_grad); void CompleteGradOp(OperatorBase* grad_op) const; const OperatorBase* op_; std::vector> arg_list_; -} +}; } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index fcb529bbac..fffef31be3 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -6,7 +6,7 @@ #include #include #include "paddle/framework/attr_checker.h" -#include "paddle/framework/grad_op_creater.h" +#include "paddle/framework/grad_op_creator.h" #include "paddle/framework/op_desc.pb.h" #include 
"paddle/framework/scope.h" From b635af71d8894f1f66c12f661ed2caf302dc9513 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 20 Jul 2017 11:00:21 +0800 Subject: [PATCH 052/100] Fix some compile error --- paddle/framework/grad_op_creator.cc | 32 +++++++++++++++-------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/paddle/framework/grad_op_creator.cc b/paddle/framework/grad_op_creator.cc index dbc10d5ad5..ac3663b7fc 100644 --- a/paddle/framework/grad_op_creator.cc +++ b/paddle/framework/grad_op_creator.cc @@ -12,7 +12,8 @@ OperatorBase* GradOpCreator::Create() { OpInOutArg* GradOpCreator::BuildArg(const VarProto& var, const VarIndexMap& var_map, - const vector& format, InOutType type) { + const std::vector& format, + InOutType type) { int idx = var_map.at(var.name()); int begin_idx = format.empty() ? idx : format.at(idx); int end_idx = format.empty() ? idx + 1 : format.at(idx + 1); @@ -23,11 +24,11 @@ OpInOutArg* GradOpCreator::BuildArg(const VarProto& var, void GradOpCreator::BuildOpInOutArgList() { const OpProto& op_proto = OpRegistry::protos().at(op_->type); const auto& var_map = *(OpRegistry::VarIndexMaps().at(op->type_)); - const vector& in_format = + const std::vector& in_format = op_->attrs_.count("input_format") ? op->GetAttr>("input_format") : std::vector(); - const vector& out_format = + const std::vector& out_format = op_->attrs_.count("output_format") ? op->GetAttr>("output_format") : std::vector(); @@ -41,10 +42,11 @@ void GradOpCreator::BuildOpInOutArgList() { } } -void GradOpCreator::PushArgIntoGradOp(const OpInOutArg* arg, - vector& in_out, - vector& format, VarIndexMap* varmap, - int& idx, bool is_grad) { +void GradOpCreator::AddArgIntoGradOp(const OpInOutArg* arg, + std::vector& in_out, + std::vector& format, + VarIndexMap* varmap, int& idx, + bool is_grad) { std::string var_name = arg->proto_name_; if (is_grad) { var_name += OperatorBase::GRAD_VAR_SUFFIX(); @@ -70,22 +72,22 @@ void GradOpCreator::CompleteGradOp(OperatorBase* grad_op) const { VarIndexMap* grad_varmap = new VarIndexMap(); int in_idx = 0; int out_idx = 0; - vector in_format({0}); - vector out_format({0}); + std::vector in_format({0}); + std::vector out_format({0}); for (const auto& arg : arg_list_) { // op_'s inputs_ and outputs_ if (arg->needed_in_grad_) { - PushArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap, - in_idx, false); + AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap, + in_idx, false); } if (arg->type_ == IN) { // gradients of op_'s inputs_ - PushArgIntoGradOp(arg.get(), grad_op->outputs_, out_format, grad_varmap, - out_idx, true); + AddArgIntoGradOp(arg.get(), grad_op->outputs_, out_format, grad_varmap, + out_idx, true); } else { // gradients of op_'s outputs_ - PushArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap, - in_idx, true); + AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap, + in_idx, true); } } grad_op->attrs_["input_format"] = in_format; From b3115fb01c007abea7e7ea7bf41363c5669e844a Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 20 Jul 2017 11:21:37 +0800 Subject: [PATCH 053/100] Add SetDeviceId in memcpy --- paddle/memory/memory.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 4056a54b4a..78443cc35a 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -86,18 +86,22 @@ size_t Used(platform::GPUPlace place) { } template <> -void Copy(platform::CPUPlace, void* dst, - 
platform::GPUPlace, +void Copy(platform::CPUPlace dst_place, + void* dst, + platform::GPUPlace src_place, const void* src, size_t num, cudaStream_t stream) { + platform::SetDeviceId(src_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); } template <> -void Copy(platform::GPUPlace, void* dst, - platform::CPUPlace, +void Copy(platform::GPUPlace dst_place, + void* dst, + platform::CPUPlace src_place, const void* src, size_t num, cudaStream_t stream) { + platform::SetDeviceId(dst_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); } @@ -108,6 +112,7 @@ void Copy(platform::GPUPlace dst_place, const void* src, size_t num, cudaStream_t stream) { if (dst_place == src_place) { + platform::SetDeviceId(src_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); } else { platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num, From 577bb4e3467aebf07118c69b85c6a246db235be8 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 20 Jul 2017 11:22:38 +0800 Subject: [PATCH 054/100] rename mkllite to mklml --- CMakeLists.txt | 10 +++--- cmake/cblas.cmake | 28 ++++------------ cmake/configure.cmake | 2 +- cmake/external/gtest.cmake | 6 ++-- cmake/external/mkldnn.cmake | 8 ++--- cmake/external/mkllite.cmake | 61 ----------------------------------- cmake/external/mklml.cmake | 61 +++++++++++++++++++++++++++++++++++ paddle/math/MathFunctions.cpp | 2 +- paddle/math/MathFunctions.h | 2 +- 9 files changed, 83 insertions(+), 97 deletions(-) delete mode 100644 cmake/external/mkllite.cmake create mode 100644 cmake/external/mklml.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index fedf5db0b7..62ab4669cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,7 +38,7 @@ include(simd) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND}) -option(WITH_MKL_LITE "Compile PaddlePaddle with mkl lite package." ${AVX_FOUND}) +option(WITH_MKLML "Compile PaddlePaddle with mklml package." 
${AVX_FOUND}) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) @@ -79,8 +79,8 @@ if(ANDROID) "Disable RDMA when cross-compiling for Android" FORCE) set(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN when cross-compiling for Android" FORCE) - set(WITH_MKL_LITE OFF CACHE STRING - "Disable MKL lite package when cross-compiling for Android" FORCE) + set(WITH_MKLML OFF CACHE STRING + "Disable MKLML package when cross-compiling for Android" FORCE) endif(ANDROID) set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING @@ -94,7 +94,7 @@ endif() ######################################################################################## -include(external/mkllite) # download mkl minimal lite package +include(external/mklml) # download mklml package include(external/zlib) # download, build, install zlib include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog @@ -145,7 +145,7 @@ if(WITH_GPU) endif(WITH_GPU) if(WITH_MKLDNN) - list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKL_LITE_LIB_IOMP}) + list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKLML_LIB_IOMP}) endif() if(USE_NNPACK) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 52556b1b40..854066fd1d 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -15,17 +15,17 @@ set(CBLAS_FOUND OFF) -## Find MKL Lite First. -if(WITH_MKL_LITE AND MKL_LITE_INC_DIR AND MKL_LITE_LIB) +## Find MKLML First. +if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB) set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER MKL_LITE) - set(CBLAS_INC_DIR ${MKL_LITE_INC_DIR}) - set(CBLAS_LIBRARIES ${MKL_LITE_LIB}) + set(CBLAS_PROVIDER MKLML) + set(CBLAS_INC_DIR ${MKLML_INC_DIR}) + set(CBLAS_LIBRARIES ${MKLML_LIB}) - add_definitions(-DPADDLE_USE_MKL_LITE) + add_definitions(-DPADDLE_USE_MKLML) add_definitions(-DLAPACK_FOUND) - message(STATUS "Found cblas and lapack in MKL Lite " + message(STATUS "Found cblas and lapack in MKLML " "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") return() endif() @@ -43,20 +43,6 @@ set(MKL_LIB_SEARCH_PATHS ${INTEL_MKL_ROOT}/lib ${INTEL_MKL_ROOT}/lib/intel64) -if(MKL_LITE_INC_DIR AND MKL_LITE_LIB) - set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER MKL_LITE) - set(CBLAS_INC_DIR ${MKL_LITE_INC_DIR}) - set(CBLAS_LIBRARIES ${MKL_LITE_LIB}) - - add_definitions(-DPADDLE_USE_MKL_LITE) - add_definitions(-DLAPACK_FOUND) - - message(STATUS "Found cblas and lapack in MKL Lite " - "(include: ${MKL_LITE_INC_DIR}, library: ${CBLAS_LIBRARIES})") - return() -endif() - find_path(MKL_INC_DIR mkl.h PATHS ${MKL_INCLUDE_SEARCH_PATHS}) find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 37eececfd5..69220e03fe 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -69,7 +69,7 @@ endif(NOT WITH_GPU) if(WITH_MKLDNN) add_definitions(-DPADDLE_USE_MKLDNN) - if (WITH_MKL_LITE AND MKLDNN_IOMP_DIR) + if (WITH_MKLML AND MKLDNN_IOMP_DIR) message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}") set(OPENMP_FLAGS "-fopenmp") set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 8ac68aa325..e3970073a1 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -34,9 +34,9 @@ IF(WITH_TESTING) "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." 
FORCE) ENDIF(WIN32) - IF(WITH_MKL_LITE) - # wait for mkl downloading completed - SET(GTEST_DEPENDS ${MKL_LITE_PROJECT}) + IF(WITH_MKLML) + # wait for mklml downloading completed + SET(GTEST_DEPENDS ${MKLML_PROJECT}) ENDIF() ExternalProject_Add( diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 28a753e19a..9066b5abd5 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -38,10 +38,10 @@ ENDIF(WIN32) INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR}) -IF(${CBLAS_PROVIDER} STREQUAL "MKL_LITE") - SET(MKLDNN_DEPENDS ${MKL_LITE_PROJECT}) - SET(MKLDNN_MKLROOT ${MKL_LITE_ROOT}) - SET(MKLDNN_IOMP_DIR ${MKL_LITE_LIB_DIR}) +IF(${CBLAS_PROVIDER} STREQUAL "MKLML") + SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) + SET(MKLDNN_MKLROOT ${MKLML_ROOT}) + SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR}) ENDIF() ExternalProject_Add( diff --git a/cmake/external/mkllite.cmake b/cmake/external/mkllite.cmake deleted file mode 100644 index e889290e36..0000000000 --- a/cmake/external/mkllite.cmake +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -IF(NOT ${WITH_MKL_LITE}) - return() -ENDIF(NOT ${WITH_MKL_LITE}) - -INCLUDE(ExternalProject) - -SET(MKL_LITE_PROJECT "extern_mkllite") -SET(MKL_LITE_VER "mklml_lnx_2018.0.20170425") -SET(MKL_LITE_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKL_LITE_VER}.tgz") -SET(MKL_LITE_DOWNLOAD_DIR ${THIRD_PARTY_PATH}/mkllite) - -SET(MKL_LITE_ROOT ${MKL_LITE_DOWNLOAD_DIR}/${MKL_LITE_VER}) -SET(MKL_LITE_INC_DIR ${MKL_LITE_ROOT}/include) -SET(MKL_LITE_LIB_DIR ${MKL_LITE_ROOT}/lib) -SET(MKL_LITE_LIB ${MKL_LITE_LIB_DIR}/libmklml_intel.so) -SET(MKL_LITE_IOMP_LIB ${MKL_LITE_LIB_DIR}/libiomp5.so) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKL_LITE_ROOT}/lib") - -INCLUDE_DIRECTORIES(${MKL_LITE_INC_DIR}) - -ExternalProject_Add( - ${MKL_LITE_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${MKL_LITE_DOWNLOAD_DIR} - DOWNLOAD_DIR ${MKL_LITE_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${MKL_LITE_URL} - && tar -xzf ${MKL_LITE_DOWNLOAD_DIR}/${MKL_LITE_VER}.tgz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - PATCH_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) - -IF (${CMAKE_VERSION} VERSION_LESS "3.3.0") - SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkllite_dummy.c) - FILE(WRITE ${dummyfile} "const char * dummy_mkllite = \"${dummyfile}\";") - ADD_LIBRARY(mkllite STATIC ${dummyfile}) -ELSE() - ADD_LIBRARY(mkllite INTERFACE) -ENDIF() - -ADD_DEPENDENCIES(mkllite ${MKL_LITE_PROJECT}) - -LIST(APPEND external_project_dependencies mkllite) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake new file mode 100644 index 0000000000..45b3f9d85b --- /dev/null +++ b/cmake/external/mklml.cmake @@ -0,0 +1,61 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_MKLML}) + return() +ENDIF(NOT ${WITH_MKLML}) + +INCLUDE(ExternalProject) + +SET(MKLML_PROJECT "extern_mklml") +SET(MKLML_VER "mklml_lnx_2018.0.20170425") +SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz") +SET(MKLML_DOWNLOAD_DIR ${THIRD_PARTY_PATH}/mklml) + +SET(MKLML_ROOT ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}) +SET(MKLML_INC_DIR ${MKLML_ROOT}/include) +SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) +SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) +SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") + +INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) + +ExternalProject_Add( + ${MKLML_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${MKLML_DOWNLOAD_DIR} + DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate -O ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz ${MKLML_URL} + && tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + PATCH_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +IF (${CMAKE_VERSION} VERSION_LESS "3.3.0") + SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mklml_dummy.c) + FILE(WRITE ${dummyfile} "const char * dummy_mklml = \"${dummyfile}\";") + ADD_LIBRARY(mklml STATIC ${dummyfile}) +ELSE() + ADD_LIBRARY(mklml INTERFACE) +ENDIF() + +ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) + +LIST(APPEND external_project_dependencies mklml) diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index 999b72cc15..c8ba1074a1 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -202,7 +202,7 @@ double dotProduct(const int n, const double* x, const double* y) { return cblas_ddot(n, x, 1, y, 1); } -#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKL_LITE) +#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML) template <> void vExp(const int n, const float* a, float* r) { diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index 799948cf08..637643838f 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -15,7 +15,7 @@ limitations under the License. */ #ifndef MATHFUNCTIONS_H_ #define MATHFUNCTIONS_H_ -#ifdef PADDLE_USE_MKL_LITE +#ifdef PADDLE_USE_MKLML #include #include #include From 088e2200c4e054a9100109eacb291365d9b96ea7 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Thu, 20 Jul 2017 12:41:42 +0800 Subject: [PATCH 055/100] "remove unused fake fc op" --- paddle/framework/fully_connected_op.h | 54 --------------------------- 1 file changed, 54 deletions(-) delete mode 100644 paddle/framework/fully_connected_op.h diff --git a/paddle/framework/fully_connected_op.h b/paddle/framework/fully_connected_op.h deleted file mode 100644 index f049eda9bb..0000000000 --- a/paddle/framework/fully_connected_op.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
- - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include -#include "paddle/framework/op_registry.h" - -namespace paddle { -namespace framework { -class FCOp : public OperatorBase { - public: - void Run(const ScopePtr& scope, - const platform::DeviceContext& dev_ctx) const override { - std::cout << "FC" << std::endl; - }; - void InferShape(const ScopePtr& scope) const override{}; -}; - -class FCOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { - public: - FCOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("x", "input data"); - AddInput("w", "weights"); - AddInput("b", "bias"); - AddOutput("y", "output data"); - AddComment("Fully connnect op"); - } -}; - -class FCGradientOp : public OperatorBase { - void Run(const ScopePtr& scope, - const platform::DeviceContext& dev_ctx) const override { - std::cout << "FCGrad" << std::endl; - }; - void InferShape(const ScopePtr& scope) const override{}; -}; - -// class FCGradientOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {}; -REGISTER_OP(my_fc, FCOp, FCOpProtoAndCheckerMaker); -REGISTER_GRADIENT_OP(my_fc_grad, FCGradientOp); - -} // namespace framework -} // namespace paddle From f6dd8ae1644608f7b3b986c9a3bc53c0e36d2fa6 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 20 Jul 2017 14:15:35 +0800 Subject: [PATCH 056/100] fix make warning compile with mkldnn --- cmake/external/mkldnn.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 9066b5abd5..fba0756f3d 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -54,8 +54,8 @@ ExternalProject_Add( CONFIGURE_COMMAND mkdir -p /build BUILD_COMMAND cd /build && cmake .. 
-DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} -DMKLROOT=${MKLDNN_MKLROOT} - && make all -j${CPU_CORES} - INSTALL_COMMAND cd /build && make install + && $(MAKE) + INSTALL_COMMAND cd /build && $(MAKE) install UPDATE_COMMAND "" ) From 9418717f80da0c01a39bee53b7eb4df7ca569134 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 20 Jul 2017 15:57:02 +0800 Subject: [PATCH 057/100] Fix compile errors --- paddle/framework/CMakeLists.txt | 3 ++- paddle/framework/grad_op_creator.cc | 32 ++++++++++++++++++------ paddle/framework/grad_op_creator.h | 38 ++++++++++++++--------------- paddle/framework/op_registry.h | 14 +++++++++++ 4 files changed, 59 insertions(+), 28 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 9e9491d983..a43861f4cd 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -19,7 +19,8 @@ cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) -cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc) +cc_library(grad_op_creator SRCS grad_op_creator.cc) +cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc grad_op_creator) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator) py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) diff --git a/paddle/framework/grad_op_creator.cc b/paddle/framework/grad_op_creator.cc index ac3663b7fc..106c2eae9d 100644 --- a/paddle/framework/grad_op_creator.cc +++ b/paddle/framework/grad_op_creator.cc @@ -1,4 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include "paddle/framework/grad_op_creator.h" +#include "paddle/framework/op_registry.h" namespace paddle { namespace framework { @@ -22,15 +37,15 @@ OpInOutArg* GradOpCreator::BuildArg(const VarProto& var, } void GradOpCreator::BuildOpInOutArgList() { - const OpProto& op_proto = OpRegistry::protos().at(op_->type); - const auto& var_map = *(OpRegistry::VarIndexMaps().at(op->type_)); + const OpProto& op_proto = OpRegistry::protos().at(op_->type_); + const auto& var_map = *(OpRegistry::VarIndexMaps().at(op_->type_)); const std::vector& in_format = op_->attrs_.count("input_format") - ? op->GetAttr>("input_format") + ? op_->GetAttr>("input_format") : std::vector(); const std::vector& out_format = op_->attrs_.count("output_format") - ? op->GetAttr>("output_format") + ? 
op_->GetAttr>("output_format") : std::vector(); for (const auto& var : op_proto.inputs()) { arg_list_.emplace_back( @@ -46,14 +61,15 @@ void GradOpCreator::AddArgIntoGradOp(const OpInOutArg* arg, std::vector& in_out, std::vector& format, VarIndexMap* varmap, int& idx, - bool is_grad) { + bool is_grad) const { std::string var_name = arg->proto_name_; if (is_grad) { var_name += OperatorBase::GRAD_VAR_SUFFIX(); } - *(varmap)[var_name] = idx++; + (*varmap)[var_name] = idx++; size_t pre_sz = in_out.size(); - auto base_it = arg->type == IN ? op_->inputs_.begin() : op_->outputs_.begin(); + auto base_it = + arg->type_ == IN ? op_->inputs_.begin() : op_->outputs_.begin(); std::copy(base_it + arg->begin_idx_, base_it + arg->end_idx_, std::back_inserter(in_out)); if (is_grad) { @@ -96,4 +112,4 @@ void GradOpCreator::CompleteGradOp(OperatorBase* grad_op) const { } } // namespace framework -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/framework/grad_op_creator.h b/paddle/framework/grad_op_creator.h index 456b066f1d..21b160a73f 100644 --- a/paddle/framework/grad_op_creator.h +++ b/paddle/framework/grad_op_creator.h @@ -7,6 +7,24 @@ namespace paddle { namespace framework { class OpRegistry; +enum InOutType { IN, OUT }; + +struct OpInOutArg { + OpInOutArg(const std::string& proto_name, const InOutType& type, + bool needed_in_grad, size_t begin_idx, size_t end_idx) + : proto_name_(proto_name), + type_(type), + needed_in_grad_(needed_in_grad), + begin_idx_(begin_idx), + end_idx_(end_idx) {} + + std::string proto_name_; + InOutType type_; + bool needed_in_grad_; + size_t begin_idx_; + size_t end_idx_; +}; + class GradOpCreator { using VarIndexMap = std::unordered_map; @@ -15,30 +33,12 @@ class GradOpCreator { OperatorBase* Create(); private: - enum InOutType { IN, OUT }; - - struct OpInOutArg { - OpInOutArg(const std::string& proto_name, const InOutType& type, - bool needed_in_grad, size_t begin_idx, size_t end_idx) - : proto_name_(proto_name), - type_(type), - needed_in_grad_(needed_in_grad), - begin_idx_(begin_idx), - end_idx_(end_idx) {} - - std::string proto_name_; - InOutType type_; - bool needed_in_grad_; - size_t begin_idx_; - size_t end_idx_; - }; - OpInOutArg* BuildArg(const VarProto& var, const VarIndexMap& var_map, const std::vector& format, InOutType type); void BuildOpInOutArgList(); void AddArgIntoGradOp(const OpInOutArg* arg, std::vector& in_out, std::vector& format, VarIndexMap* varmap, int& idx, - bool is_grad); + bool is_grad) const; void CompleteGradOp(OperatorBase* grad_op) const; const OperatorBase* op_; std::vector> arg_list_; diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 897238fc69..bbeeefb20c 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
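In the common case where every forward operand is needed for the backward pass, `CompleteGradOp` above lays the gradient op's operands out as: forward inputs, then forward outputs, then output gradients on the input side, with input gradients on the output side. The stand-alone snippet below reproduces that naming scheme; the variable names are made up, and the literal "@GRAD" suffix is an assumption standing in for `OperatorBase::GRAD_VAR_SUFFIX()`.

```cpp
#include <iostream>
#include <string>
#include <vector>

int main() {
  const std::string kGradSuffix = "@GRAD";  // assumed value of GRAD_VAR_SUFFIX()
  std::vector<std::string> fwd_inputs = {"X", "W"};
  std::vector<std::string> fwd_outputs = {"Out"};

  // Gradient op inputs: X, W, Out, Out@GRAD
  std::vector<std::string> grad_inputs = fwd_inputs;
  grad_inputs.insert(grad_inputs.end(), fwd_outputs.begin(), fwd_outputs.end());
  for (const auto& name : fwd_outputs) {
    grad_inputs.push_back(name + kGradSuffix);
  }

  // Gradient op outputs: X@GRAD, W@GRAD
  std::vector<std::string> grad_outputs;
  for (const auto& name : fwd_inputs) {
    grad_outputs.push_back(name + kGradSuffix);
  }

  for (const auto& n : grad_inputs) std::cout << "grad input:  " << n << "\n";
  for (const auto& n : grad_outputs) std::cout << "grad output: " << n << "\n";
  return 0;
}
```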
*/ + #pragma once #include From a7e23a4cc2a91859c295569164c9a9d2e576daa1 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 20 Jul 2017 16:47:37 +0800 Subject: [PATCH 058/100] fix link error static to shared --- CMakeLists.txt | 2 +- cmake/external/mkldnn.cmake | 3 ++- cmake/external/mklml.cmake | 11 ++--------- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 62ab4669cb..4cdd8dbd77 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,7 +145,7 @@ if(WITH_GPU) endif(WITH_GPU) if(WITH_MKLDNN) - list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKLML_LIB_IOMP}) + list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKLDNN_IOMP_LIB}) endif() if(USE_NNPACK) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index fba0756f3d..2b74479273 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -41,6 +41,7 @@ INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR}) IF(${CBLAS_PROVIDER} STREQUAL "MKLML") SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) SET(MKLDNN_MKLROOT ${MKLML_ROOT}) + SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB}) SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR}) ENDIF() @@ -59,7 +60,7 @@ ExternalProject_Add( UPDATE_COMMAND "" ) -ADD_LIBRARY(mkldnn STATIC IMPORTED GLOBAL) +ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIBRARY}) ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIBRARY}") diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 45b3f9d85b..dff51baec3 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -48,14 +48,7 @@ ExternalProject_Add( TEST_COMMAND "" ) -IF (${CMAKE_VERSION} VERSION_LESS "3.3.0") - SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mklml_dummy.c) - FILE(WRITE ${dummyfile} "const char * dummy_mklml = \"${dummyfile}\";") - ADD_LIBRARY(mklml STATIC ${dummyfile}) -ELSE() - ADD_LIBRARY(mklml INTERFACE) -ENDIF() - +ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) - LIST(APPEND external_project_dependencies mklml) From 19ec08fa866e6c4054dc16167c1e42ab13f22c20 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 20 Jul 2017 23:43:31 +0800 Subject: [PATCH 059/100] change install path of mkldnn and mklml --- cmake/external/mkldnn.cmake | 2 +- cmake/external/mklml.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 2b74479273..615b1ddf35 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,7 +20,7 @@ INCLUDE(ExternalProject) SET(MKLDNN_PROJECT "extern_mkldnn") SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) -SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) +SET(MKLDNN_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/opt/paddle/third_party/mkldnn") SET(MKLDNN_INCLUDE_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." 
FORCE) IF(WIN32) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index dff51baec3..e1925a07d0 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -21,7 +21,7 @@ INCLUDE(ExternalProject) SET(MKLML_PROJECT "extern_mklml") SET(MKLML_VER "mklml_lnx_2018.0.20170425") SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz") -SET(MKLML_DOWNLOAD_DIR ${THIRD_PARTY_PATH}/mklml) +SET(MKLML_DOWNLOAD_DIR "${CMAKE_INSTALL_PREFIX}/opt/paddle/third_party/mklml") SET(MKLML_ROOT ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}) SET(MKLML_INC_DIR ${MKLML_ROOT}/include) From 248149f44863820baeed54ca24b0231c46624402 Mon Sep 17 00:00:00 2001 From: xzl Date: Fri, 21 Jul 2017 00:13:54 +0800 Subject: [PATCH 060/100] add depthwiseconv test and fix the little bug of the convOpTest --- paddle/function/ConvOpTest.cpp | 349 +++++++++++++++++++-------------- 1 file changed, 202 insertions(+), 147 deletions(-) diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp index f0c45c97b1..7f32c73479 100644 --- a/paddle/function/ConvOpTest.cpp +++ b/paddle/function/ConvOpTest.cpp @@ -38,76 +38,76 @@ public: for (size_t filterSize : {1, 3, 5}) { for (size_t inputChannels : {3, 64}) { for (size_t outputChannels : {3, 64}) { - for (size_t groups : {1, 3, 64}) { - if (inputChannels > outputChannels) break; - if (groups != 1 && - (inputChannels != groups || outputChannels % groups != 0)) - continue; - if (!useGroups) groups = 1; - - for (size_t stride : {1, 2}) { - for (size_t padding : {0, 1}) { - if (padding >= filterSize) break; - size_t outputSize = - (inputSize - filterSize + 2 * padding + stride) / - stride; - VLOG(3) << " batchSize=" << batchSize - << " inputChannels=" << inputChannels - << " inputHeight=" << inputSize - << " inputWidth=" << inputSize - << " outputChannels=" << outputChannels - << " filterHeight=" << filterSize - << " filterWidth=" << filterSize - << " outputHeight=" << outputSize - << " outputWidth=" << outputSize - << " stride=" << stride << " padding=" << padding; - - std::vector paddings = {padding, padding}; - std::vector strides = {stride, stride}; - Compare2Function test( - conv1, - conv2, - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", groups) - .set("algo", algo)); - - TensorShape input{ - batchSize, inputChannels, inputSize, inputSize}; - - TensorShape filter; - if (groups > 1) - filter = TensorShape({groups, - outputChannels / groups, - inputChannels / groups, - filterSize, - filterSize}); - else - filter = TensorShape({outputChannels, - inputChannels, - filterSize, - filterSize}); - TensorShape output{ - batchSize, outputChannels, outputSize, outputSize}; - - if (type == kForwardTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.run(); - } else if (type == kBackwardInputTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), - ADD_TO); - test.run(); - } else if (type == kBackwardFilterTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.run(); - } + if (inputChannels > outputChannels) break; + size_t groups; + if (!useGroups) { + groups = 1; + } else { + if (outputChannels % inputChannels != 0) continue; + 
groups = inputChannels; + } + + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + if (padding >= filterSize) break; + size_t outputSize = + (inputSize - filterSize + 2 * padding + stride) / stride; + VLOG(3) << " batchSize=" << batchSize + << " inputChannels=" << inputChannels + << " inputHeight=" << inputSize + << " inputWidth=" << inputSize + << " outputChannels=" << outputChannels + << " filterHeight=" << filterSize + << " filterWidth=" << filterSize + << " outputHeight=" << outputSize + << " outputWidth=" << outputSize + << " stride=" << stride << " padding=" << padding; + + std::vector paddings = {padding, padding}; + std::vector strides = {stride, stride}; + Compare2Function test( + conv1, + conv2, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", groups) + .set("algo", algo)); + + TensorShape input{ + batchSize, inputChannels, inputSize, inputSize}; + + TensorShape filter; + if (groups > 1) + filter = TensorShape({groups, + outputChannels / groups, + inputChannels / groups, + filterSize, + filterSize}); + else + filter = TensorShape({outputChannels, + inputChannels, + filterSize, + filterSize}); + TensorShape output{ + batchSize, outputChannels, outputSize, outputSize}; + + if (type == kForwardTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.run(); + } else if (type == kBackwardInputTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); + test.run(); + } else if (type == kBackwardFilterTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter), + ADD_TO); + test.run(); } } } @@ -136,77 +136,78 @@ public: for (size_t filterWidth : {3, 7}) { for (size_t inputChannels : {7}) { for (size_t outputChannels : {7}) { - for (size_t groups : {1, 7}) { - if (groups != 1 && (inputChannels != groups || - outputChannels % groups != 0)) - continue; - if (!useGroups) groups = 1; - - size_t stride = 1; - size_t padding = 0; - size_t outputHeight = - (inputHeight - filterHeight + 2 * padding + stride) / - stride; - size_t outputWidth = - (inputWidth - filterWidth + 2 * padding + stride) / - stride; - VLOG(3) << " batchSize=" << batchSize - << " inputChannels=" << inputChannels - << " inputHeight=" << inputHeight - << " inputWidth=" << inputWidth - << " outputChannels=" << outputChannels - << " filterHeight=" << filterHeight - << " filterWidth=" << filterWidth - << " outputHeight=" << outputHeight - << " outputWidth=" << outputWidth - << " stride=" << stride << " padding=" << padding; - - std::vector paddings = {padding, padding}; - std::vector strides = {stride, stride}; - Compare2Function test( - conv1, - conv2, - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", groups) - .set("algo", algo)); - - TensorShape input{ - batchSize, inputChannels, inputHeight, inputWidth}; - - TensorShape filter; - if (groups > 1) - filter = TensorShape({groups, - outputChannels / groups, - inputChannels / groups, - filterHeight, - filterWidth}); - else - filter = TensorShape({outputChannels, - inputChannels, - filterHeight, - filterWidth}); - TensorShape output{ - batchSize, outputChannels, outputHeight, outputWidth}; - - if (type == kForwardTest) { - 
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.run(); - } else if (type == kBackwardInputTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), - ADD_TO); - test.run(); - } else if (type == kBackwardFilterTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.run(); - } + size_t groups; + if (!useGroups) { + groups = 1; + } else { + if (outputChannels % inputChannels != 0) continue; + groups = inputChannels; + } + + size_t stride = 1; + size_t padding = 0; + size_t outputHeight = + (inputHeight - filterHeight + 2 * padding + stride) / + stride; + size_t outputWidth = + (inputWidth - filterWidth + 2 * padding + stride) / + stride; + VLOG(3) << " batchSize=" << batchSize + << " inputChannels=" << inputChannels + << " inputHeight=" << inputHeight + << " inputWidth=" << inputWidth + << " outputChannels=" << outputChannels + << " filterHeight=" << filterHeight + << " filterWidth=" << filterWidth + << " outputHeight=" << outputHeight + << " outputWidth=" << outputWidth + << " stride=" << stride << " padding=" << padding; + + std::vector paddings = {padding, padding}; + std::vector strides = {stride, stride}; + Compare2Function test( + conv1, + conv2, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", groups) + .set("algo", algo)); + + TensorShape input{ + batchSize, inputChannels, inputHeight, inputWidth}; + + TensorShape filter; + if (groups > 1) + filter = TensorShape({groups, + outputChannels / groups, + inputChannels / groups, + filterHeight, + filterWidth}); + else + filter = TensorShape({outputChannels, + inputChannels, + filterHeight, + filterWidth}); + TensorShape output{ + batchSize, outputChannels, outputHeight, outputWidth}; + + if (type == kForwardTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.run(); + } else if (type == kBackwardInputTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); + test.run(); + } else if (type == kBackwardFilterTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter), + ADD_TO); + test.run(); } } } @@ -218,6 +219,8 @@ public: } }; +// ======Start Convolution TEST====== + TEST(Forward, GEMM) { ConvolutionTest test( "NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false); @@ -228,24 +231,76 @@ TEST(Forward, GEMM) { #ifndef PADDLE_ONLY_CPU TEST(Forward, GEMM2) { ConvolutionTest test( - "GemmConv-CPU", "GemmConv-GPU", kForwardTest); + "GemmConv-CPU", "GemmConv-GPU", kForwardTest, false); ConvolutionTest2 test2( - "GemmConv-CPU", "GemmConv-GPU", kForwardTest); + "GemmConv-CPU", "GemmConv-GPU", kForwardTest, false); } TEST(BackwardInput, GEMM) { ConvolutionTest test( - "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest); + "GemmConvGradInput-CPU", + "GemmConvGradInput-GPU", + kBackwardInputTest, + false); ConvolutionTest2 test2( - "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", 
kBackwardInputTest); + "GemmConvGradInput-CPU", + "GemmConvGradInput-GPU", + kBackwardInputTest, + false); } TEST(BackwardFilter, GEMM) { ConvolutionTest test( - "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest); + "GemmConvGradFilter-CPU", + "GemmConvGradFilter-GPU", + kBackwardFilterTest, + false); ConvolutionTest2 test2( - "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest); + "GemmConvGradFilter-CPU", + "GemmConvGradFilter-GPU", + kBackwardFilterTest, + false); } #endif +// ======End Convolution TEST====== + +// ======Start DepthwiseConvolution TEST====== + +// TODO(zhaolong) The depthwise convolution cpu test will be added when the cpu +// version of depthwiseConv is implemented. + +#ifndef PADDLE_ONLY_CPU + +TEST(DepthwiseConvForward, GEMM2) { + ConvolutionTest test( + "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest); + ConvolutionTest2 test2( + "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest); +} + +TEST(DepthwiseConvBackwardInput, GEMM) { + ConvolutionTest test( + "GemmConvGradInput-CPU", + "DepthwiseConvGradInput-GPU", + kBackwardInputTest); + ConvolutionTest2 test2( + "GemmConvGradInput-CPU", + "DepthwiseConvGradInput-GPU", + kBackwardInputTest); +} + +TEST(DepthwiseConvBackwardFilter, GEMM) { + ConvolutionTest test( + "GemmConvGradFilter-CPU", + "DepthwiseConvGradFilter-GPU", + kBackwardFilterTest); + ConvolutionTest2 test2( + "GemmConvGradFilter-CPU", + "DepthwiseConvGradFilter-GPU", + kBackwardFilterTest); +} + +#endif +// ======End DepthwiseConvolution TEST====== } // namespace paddle From 2fd43fc5a0b701ce1a097c7267dab3145276fea6 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 21 Jul 2017 00:33:00 +0800 Subject: [PATCH 061/100] separate mklml download path and install path --- cmake/external/mklml.cmake | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index e1925a07d0..84629f01ac 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -21,9 +21,10 @@ INCLUDE(ExternalProject) SET(MKLML_PROJECT "extern_mklml") SET(MKLML_VER "mklml_lnx_2018.0.20170425") SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz") -SET(MKLML_DOWNLOAD_DIR "${CMAKE_INSTALL_PREFIX}/opt/paddle/third_party/mklml") +SET(MKLML_DOWNLOAD_DIR "${THIRD_PARTY_PATH}/mklml") +SET(MKLML_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/opt/paddle/third_party/mklml") -SET(MKLML_ROOT ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}) +SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER}) SET(MKLML_INC_DIR ${MKLML_ROOT}/include) SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) @@ -38,7 +39,8 @@ ExternalProject_Add( PREFIX ${MKLML_DOWNLOAD_DIR} DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} DOWNLOAD_COMMAND wget --no-check-certificate -O ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz ${MKLML_URL} - && tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz + && mkdir -p ${MKLML_INSTALL_DIR} + && tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz -C ${MKLML_INSTALL_DIR} DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" PATCH_COMMAND "" From a7e69d949f23c6025ba93578e29020fba694d08c Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Fri, 21 Jul 2017 00:55:58 +0000 Subject: [PATCH 062/100] do not do log.Errorln when checkpoint is not found (which is normal) --- go/cmd/pserver/pserver.go | 6 +++++- go/pserver/service.go | 8 ++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index 20094fbab4..aa81d0432b 
100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -59,7 +59,11 @@ func main() { cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e) if err != nil { - log.Errorf("Fetch checkpoint failed, %s", err) + if err == pserver.ErrCheckpointNotFound { + log.Infof("Could not find the pserver checkpoint.") + } else { + log.Errorf("Fetch checkpoint failed, %s", err) + } } } diff --git a/go/pserver/service.go b/go/pserver/service.go index 46738413f0..a7767afa63 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -36,6 +36,10 @@ import ( // ElementType is the type of elements of a Parameter. type ElementType int +// ErrCheckpointNotFound indicates that the pserver checkpoint could +// not be found. +var ErrCheckpointNotFound = errors.New("checkpoint not found") + // RPC error message. const ( AlreadyInitialized = "pserver already initialized" @@ -103,6 +107,10 @@ func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (Checkpoint, e return nil, err } + if len(v) == 0 { + return nil, ErrCheckpointNotFound + } + var cpMeta checkpointMeta if err = json.Unmarshal(v, &cpMeta); err != nil { return nil, err From e8d171bbd2a0680425ab0512f9019bfb0d7d6a70 Mon Sep 17 00:00:00 2001 From: xzl Date: Fri, 21 Jul 2017 10:33:28 +0800 Subject: [PATCH 063/100] add check for groups and inputChannels --- paddle/function/DepthwiseConvOp.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp index 9180c19b11..21084bedb4 100644 --- a/paddle/function/DepthwiseConvOp.cpp +++ b/paddle/function/DepthwiseConvOp.cpp @@ -125,6 +125,7 @@ public: size_t outputHeight = output[2]; size_t outputWidth = output[3]; size_t filterMultiplier = outputChannels / groups_; + CHECK_EQ(inputChannels, groups_); real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); @@ -187,6 +188,7 @@ public: size_t outputHeight = output[2]; size_t outputWidth = output[3]; size_t filterMultiplier = outputChannels / groups_; + CHECK_EQ(inputChannels, groups_); real* outputGrad = inputs[0].data(); real* filterData = inputs[1].data(); @@ -248,6 +250,7 @@ public: size_t outputHeight = output[2]; size_t outputWidth = output[3]; size_t filterMultiplier = outputChannels / groups_; + CHECK_EQ(inputChannels, groups_); real* outputGrad = inputs[0].data(); real* inputData = inputs[1].data(); From 4736b239d978f5def9ef2dc3e13a7c8dea12f35d Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 21 Jul 2017 11:25:11 +0800 Subject: [PATCH 064/100] Add a simple test for grad_op_creator --- paddle/framework/CMakeLists.txt | 1 + paddle/framework/grad_op_creator_test.cc | 25 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 paddle/framework/grad_op_creator_test.cc diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index a43861f4cd..36da6f649b 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -22,6 +22,7 @@ cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) cc_library(grad_op_creator SRCS grad_op_creator.cc) cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc grad_op_creator) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator) +cc_test(grad_op_creator_test SRCS grad_op_creator_test.cc DEPS grad_op_creator op_registry operator add_op) py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python 
module. diff --git a/paddle/framework/grad_op_creator_test.cc b/paddle/framework/grad_op_creator_test.cc new file mode 100644 index 0000000000..ad836727c3 --- /dev/null +++ b/paddle/framework/grad_op_creator_test.cc @@ -0,0 +1,25 @@ +#include "paddle/framework/grad_op_creator.h" +#include +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" + +USE_OP(add_two); + +namespace paddle { +namespace framework { + +TEST(GradOpCreator, AddTwo) { + OperatorPtr add_op(OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {})); + OperatorPtr grad_add_op = OpRegistry::CreateGradOp(add_op); + EXPECT_EQ(static_cast(grad_add_op->inputs_.size()), 4); + EXPECT_EQ(static_cast(grad_add_op->outputs_.size()), 2); + EXPECT_EQ(grad_add_op->Input("X"), "x"); + EXPECT_EQ(grad_add_op->Input("Y"), "y"); + EXPECT_EQ(grad_add_op->Input("Out"), "out"); + EXPECT_EQ(grad_add_op->Input("Out@GRAD"), "out@GRAD"); + EXPECT_EQ(grad_add_op->Output("X@GRAD"), "x@GRAD"); + EXPECT_EQ(grad_add_op->Output("Y@GRAD"), "y@GRAD"); +} + +} // namespace framework +} // namespace paddle \ No newline at end of file From 6c528cbc2acd6f24c56c7ef5bcb0e29702092df3 Mon Sep 17 00:00:00 2001 From: xzl Date: Fri, 21 Jul 2017 11:37:19 +0800 Subject: [PATCH 065/100] add check: CHECK_EQ(outputs[0].getArgType(), ADD_TO) --- paddle/function/DepthwiseConvOp.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp index 21084bedb4..490e8d546c 100644 --- a/paddle/function/DepthwiseConvOp.cpp +++ b/paddle/function/DepthwiseConvOp.cpp @@ -172,6 +172,7 @@ public: void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { CHECK_EQ(numInputs_, inputs.size()); CHECK_EQ(numOutputs_, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); check(inputs, outputs); CHECK_EQ(outputs[0].getArgType(), ADD_TO); const TensorShape& output = inputs[0].shape(); @@ -235,6 +236,7 @@ public: void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { CHECK_EQ(numInputs_, inputs.size()); CHECK_EQ(numOutputs_, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); check(inputs, outputs); const TensorShape& output = inputs[0].shape(); const TensorShape& input = inputs[1].shape(); From da07ec1886669deb71931d6bb949d20c01033605 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 21 Jul 2017 11:58:42 +0800 Subject: [PATCH 066/100] Update Tensor and PODDeleter's template parameter 1. Change PODDeleter's template parameter 'PlaceType' to 'Place'. 2. Limit PODDeleter and Tensor::mutable_data()'s `T` to POD type. 
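[Editor's illustration — not part of the patch] The restriction described in this commit message is the usual `std::enable_if` + `std::is_pod` default-template-argument idiom. Below is a minimal, self-contained sketch of that idiom only; `FakePlace`, `FreeMemory`, and `PodOnlyDeleter` are made-up placeholder names, not the real Paddle API.

```cpp
#include <cstdlib>
#include <type_traits>

// Stand-ins for a memory "place" and a free routine; purely illustrative.
struct FakePlace {};
inline void FreeMemory(FakePlace, void* ptr) { std::free(ptr); }

// The deleter only instantiates when T is a POD type. For a non-POD T
// (e.g. std::string) the enable_if has no ::type, so the instantiation
// is rejected at compile time -- the effect this commit is after.
template <typename T,
          typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
class PodOnlyDeleter {
 public:
  explicit PodOnlyDeleter(FakePlace place) : place_(place) {}
  void operator()(T* ptr) { FreeMemory(place_, static_cast<void*>(ptr)); }

 private:
  FakePlace place_;
};

int main() {
  PodOnlyDeleter<float> ok(FakePlace{});  // compiles: float is POD
  // PodOnlyDeleter<std::string> bad(FakePlace{});  // would fail to compile
  (void)ok;
  return 0;
}
```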
--- paddle/framework/tensor.h | 10 ++++++---- paddle/memory/memory.h | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 7d0f74fc5b..a36f375d2e 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -60,13 +60,15 @@ class Tensor { offset_); } - template + template ::value>::type* = nullptr> T* mutable_data(DDim dims, platform::Place place) { Resize(dims); return mutable_data(place); } - template + template ::value>::type* = nullptr> T* mutable_data(platform::Place place) { PADDLE_ENFORCE(product(dims_) > 0, "Tensor's numel must be larger than zero to call " @@ -150,7 +152,7 @@ class Tensor { struct PlaceholderImpl : public Placeholder { PlaceholderImpl(PlaceType place, size_t size) : ptr_(static_cast(memory::Alloc(place, size)), - memory::PodDeleter(place)), + memory::PODDeleter(place)), place_(place), size_(size) {} @@ -159,7 +161,7 @@ class Tensor { virtual paddle::platform::Place place() const { return place_; } virtual std::type_index type() const { return std::type_index(typeid(T)); } - std::unique_ptr> ptr_; + std::unique_ptr> ptr_; platform::Place place_; // record the place of ptr_. size_t size_; // size of the memory block. }; diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index f5890fb844..c4fe1e5220 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -28,14 +28,16 @@ void Free(Place, void*); template size_t Used(Place); -template -class PodDeleter { +template ::value>::type* = nullptr> +class PODDeleter { public: - PodDeleter(PlaceType place) : place_(place) {} + PODDeleter(Place place) : place_(place) {} void operator()(T* ptr) { Free(place_, static_cast(ptr)); } private: - PlaceType place_; + Place place_; }; } // namespace memory From bf3b8f045eb52a6b2d12ac6fbdca34c4af11b64d Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 21 Jul 2017 13:17:18 +0800 Subject: [PATCH 067/100] Retrigger CI --- paddle/memory/memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index c4fe1e5220..d0a64869cc 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -28,7 +28,7 @@ void Free(Place, void*); template size_t Used(Place); -template ::value>::type* = nullptr> class PODDeleter { From 06358129a242c2cb58bf91c81603eccaac7b8df5 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 21 Jul 2017 13:27:34 +0800 Subject: [PATCH 068/100] Fix compile error --- paddle/operators/cross_entropy_op.cc | 2 +- paddle/operators/sgd_op.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index fe669b03ca..7d7bb09f3d 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -35,7 +35,7 @@ protected: PADDLE_ENFORCE(inputs[0]->dims().size() == 2, "X's dimension must be 2."); PADDLE_ENFORCE(outputs[0]->dims().size() == 1, "label's dimension must be 1."); - outputs[0]->set_dims(framework::make_ddim({inputs[0]->dims()[0]})); + outputs[0]->Resize(framework::make_ddim({inputs[0]->dims()[0]})); } }; diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 04df87a3ad..66ab1e0011 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -31,7 +31,7 @@ protected: PADDLE_ENFORCE(outputs[0] != nullptr, "outputs[0] mast be set"); PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(), "Two input of SGD Op's dimension must be same."); - 
outputs[0]->set_dims(inputs[0]->dims()); + outputs[0]->Resize(inputs[0]->dims()); } }; From 75f66768bb94bf09f6b61ddfddaf35cadd2bc48b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 21 Jul 2017 13:30:56 +0800 Subject: [PATCH 069/100] fix the permission issue of installing mklml --- cmake/external/mklml.cmake | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 84629f01ac..c8a20ad464 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -21,33 +21,37 @@ INCLUDE(ExternalProject) SET(MKLML_PROJECT "extern_mklml") SET(MKLML_VER "mklml_lnx_2018.0.20170425") SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz") -SET(MKLML_DOWNLOAD_DIR "${THIRD_PARTY_PATH}/mklml") -SET(MKLML_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/opt/paddle/third_party/mklml") +SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") +SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") +SET(MKLML_DST_DIR "opt/paddle/third_party/mklml") +SET(MKLML_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/${MKLML_DST_DIR}") SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER}) SET(MKLML_INC_DIR ${MKLML_ROOT}/include) SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) +SET(mklml_cmakefile ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt) +FILE(WRITE ${mklml_cmakefile} "PROJECT(MKLML)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${MKLML_VER}\n" + " DESTINATION ${MKLML_DST_DIR})\n") + ExternalProject_Add( ${MKLML_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${MKLML_DOWNLOAD_DIR} + PREFIX ${MKLML_SOURCE_DIR} DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} DOWNLOAD_COMMAND wget --no-check-certificate -O ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz ${MKLML_URL} - && mkdir -p ${MKLML_INSTALL_DIR} - && tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz -C ${MKLML_INSTALL_DIR} + && tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" - PATCH_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_INSTALL_PREFIX} ) ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) From 1cf50574c34167e9c9a0de85fa4dc99b777ad4cd Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 21 Jul 2017 13:50:20 +0800 Subject: [PATCH 070/100] defualt disable mkldnn and mklml --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4cdd8dbd77..8012a0df89 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,8 +37,8 @@ include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) -option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND}) -option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND}) +option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." OFF) +option(WITH_MKLML "Compile PaddlePaddle with mklml package." 
OFF) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) From 06acd6d0cb6d93ce75d91a4fc5d3adccb2f94ff1 Mon Sep 17 00:00:00 2001 From: qijun Date: Fri, 21 Jul 2017 14:36:53 +0800 Subject: [PATCH 071/100] add unittest for some basic OpKernels --- paddle/operators/sigmoid_op.cc | 2 +- paddle/pybind/pybind.cc | 4 ++++ .../paddle/v2/framework/tests/CMakeLists.txt | 17 +++++++++++--- .../paddle/v2/framework/tests/op_test_util.py | 5 +++- .../paddle/v2/framework/tests/test_mul_op.py | 17 ++++++++++++++ .../v2/framework/tests/test_rowwise_add_op.py | 17 ++++++++++++++ .../v2/framework/tests/test_sigmoid_op.py | 16 +++++++++++++ .../v2/framework/tests/test_softmax_op.py | 23 +++++++++++++++++++ 8 files changed, 96 insertions(+), 5 deletions(-) create mode 100644 python/paddle/v2/framework/tests/test_mul_op.py create mode 100644 python/paddle/v2/framework/tests/test_rowwise_add_op.py create mode 100644 python/paddle/v2/framework/tests/test_sigmoid_op.py create mode 100644 python/paddle/v2/framework/tests/test_softmax_op.py diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index 589b48ce80..53bf0a4c28 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -34,7 +34,7 @@ public: framework::OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "sigmoid input"); - AddInput("Y", "sigmoid output"); + AddOutput("Y", "sigmoid output"); AddComment("Sigmoid function"); } }; diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 4db9cc7446..a689092e7e 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -30,6 +30,10 @@ USE_OP(add_two); USE_OP(onehot_cross_entropy); USE_OP_WITHOUT_KERNEL(fc); USE_OP(sgd); +USE_OP(mul); +USE_OP(sigmoid); +USE_OP(softmax); +USE_OP(rowwise_add); PYBIND11_PLUGIN(core) { py::module m("core", "C++ core of Paddle Paddle"); diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index 01838b40bd..aa67792ebc 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -1,3 +1,14 @@ -add_python_test(test_framework test_protobuf.py test_scope.py - test_default_scope_funcs.py test_op_creation_methods.py - test_tensor.py test_fc_op.py test_add_two_op.py test_sgd_op.py test_cross_entropy_op.py) +add_python_test(test_framework + test_protobuf.py + test_scope.py + test_default_scope_funcs.py + test_op_creation_methods.py + test_tensor.py + test_fc_op.py + test_add_two_op.py + test_sgd_op.py + test_cross_entropy_op.py + test_mul_op.py + test_sigmoid_op.py + test_softmax_op.py + test_rowwise_add_op.py) diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py index b1fa12cc89..7b62313f8a 100644 --- a/python/paddle/v2/framework/tests/op_test_util.py +++ b/python/paddle/v2/framework/tests/op_test_util.py @@ -56,7 +56,10 @@ class OpTestMeta(type): for out_name in func.all_output_args: actual = numpy.array(scope.get_var(out_name).get_tensor()) expect = getattr(self, out_name) - numpy.testing.assert_almost_equal(actual, expect) + # TODO(qijun) The default decimal is 7, but numpy.dot and eigen.mul + # has some diff, and could not pass unittest. So I set decimal 3 here. + # And I will check this in future. 
+ numpy.testing.assert_almost_equal(actual, expect, decimal=3) obj.test_all = test_all return obj diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/framework/tests/test_mul_op.py new file mode 100644 index 0000000000..0a87e66cd0 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_mul_op.py @@ -0,0 +1,17 @@ +import unittest +from op_test_util import OpTestMeta +import numpy as np + + +class TestMulOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "mul" + self.X = np.random.random((32, 784)).astype("float32") + self.Y = np.random.random((784, 100)).astype("float32") + self.Out = np.dot(self.X, self.Y) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py new file mode 100644 index 0000000000..ef1514983c --- /dev/null +++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py @@ -0,0 +1,17 @@ +import unittest +from op_test_util import OpTestMeta +import numpy as np + + +class TestRowwiseAddOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "rowwise_add" + self.X = np.random.random((32, 784)).astype("float32") + self.b = np.random.random(784).astype("float32") + self.Out = np.add(self.X, self.b) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_sigmoid_op.py b/python/paddle/v2/framework/tests/test_sigmoid_op.py new file mode 100644 index 0000000000..50044a122f --- /dev/null +++ b/python/paddle/v2/framework/tests/test_sigmoid_op.py @@ -0,0 +1,16 @@ +import unittest +from op_test_util import OpTestMeta +import numpy as np + + +class TestSigmoidOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "sigmoid" + self.X = np.random.random((32, 100)).astype("float32") + self.Y = 1 / (1 + np.exp(-self.X)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_softmax_op.py b/python/paddle/v2/framework/tests/test_softmax_op.py new file mode 100644 index 0000000000..191b698c1c --- /dev/null +++ b/python/paddle/v2/framework/tests/test_softmax_op.py @@ -0,0 +1,23 @@ +import unittest +from op_test_util import OpTestMeta +import numpy as np + + +def stable_softmax(x): + """Compute the softmax of vector x in a numerically stable way.""" + shiftx = x - np.max(x) + exps = np.exp(shiftx) + return exps / np.sum(exps) + + +class TestSoftmaxOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "softmax" + self.X = np.random.random((32, 100)).astype("float32") + self.Y = np.apply_along_axis(stable_softmax, 1, self.X) + + +if __name__ == '__main__': + unittest.main() From 95ce961cef4166f69da78d42fff3633b62d3bbc5 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 21 Jul 2017 16:04:13 +0800 Subject: [PATCH 072/100] fix permission issue --- CMakeLists.txt | 6 +++--- cmake/external/mkldnn.cmake | 13 +++++++++---- cmake/external/mklml.cmake | 10 +++++++--- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8012a0df89..127493bce8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,8 +37,8 @@ include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) -option(WITH_MKLDNN "Compile PaddlePaddle 
with mkl-dnn support." OFF) -option(WITH_MKLML "Compile PaddlePaddle with mklml package." OFF) +option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND}) +option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND}) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) @@ -94,7 +94,7 @@ endif() ######################################################################################## -include(external/mklml) # download mklml package +include(external/mklml) # download mklml package include(external/zlib) # download, build, install zlib include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 615b1ddf35..eff15de73f 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -18,10 +18,15 @@ ENDIF(NOT ${WITH_MKLDNN}) INCLUDE(ExternalProject) -SET(MKLDNN_PROJECT "extern_mkldnn") -SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) -SET(MKLDNN_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/opt/paddle/third_party/mkldnn") -SET(MKLDNN_INCLUDE_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) +SET(MKLDNN_PROJECT "extern_mkldnn") +SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) +SET(MKLDNN_INSTALL_ROOT ${CMAKE_INSTALL_PREFIX}) +IF(NOT "$ENV{HOME}" STREQUAL "/root") + SET(MKLDNN_INSTALL_ROOT "$ENV{HOME}") +ENDIF() + +SET(MKLDNN_INSTALL_DIR "${MKLDNN_INSTALL_ROOT}/opt/paddle/third_party/mkldnn") +SET(MKLDNN_INCLUDE_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) IF(WIN32) MESSAGE(WARNING "It is not supported compiling with mkldnn in windows Paddle yet." diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index c8a20ad464..3f940756a4 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -24,8 +24,12 @@ SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9 SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") SET(MKLML_DST_DIR "opt/paddle/third_party/mklml") -SET(MKLML_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/${MKLML_DST_DIR}") +SET(MKLML_INSTALL_ROOT "${CMAKE_INSTALL_PREFIX}") +IF(NOT "$ENV{HOME}" STREQUAL "/root") + SET(MKLML_INSTALL_ROOT "$ENV{HOME}") +ENDIF() +SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER}) SET(MKLML_INC_DIR ${MKLML_ROOT}/include) SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) @@ -50,8 +54,8 @@ ExternalProject_Add( && tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_INSTALL_PREFIX} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT} ) ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) From a183a80e2e41181272c72dd2afb6eb13aa9a31b7 Mon Sep 17 00:00:00 2001 From: zlx Date: Fri, 21 Jul 2017 16:14:36 +0800 Subject: [PATCH 073/100] add param_attr for img_conv_group(...) 
--- python/paddle/trainer_config_helpers/networks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 1bf59ed484..789d4f170f 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -340,7 +340,8 @@ def img_conv_group(input, conv_with_batchnorm=False, conv_batchnorm_drop_rate=0, pool_stride=1, - pool_type=None): + pool_type=None, + param_attr=None): """ Image Convolution Group, Used for vgg net. @@ -357,6 +358,7 @@ def img_conv_group(input, :param conv_with_batchnorm: :param pool_stride: :param pool_type: + :param param_attr: :return: """ tmp = input @@ -397,6 +399,7 @@ def img_conv_group(input, padding=conv_padding[i], filter_size=conv_filter_size[i], num_filters=conv_num_filter[i], + param_attr = param_attr, **extra_kwargs) # logger.debug("tmp.num_filters = %d" % tmp.num_filters) From 21a3c9d6f49c20a5ff1945f20c918326b10065a8 Mon Sep 17 00:00:00 2001 From: zlx Date: Fri, 21 Jul 2017 16:55:22 +0800 Subject: [PATCH 074/100] add the comments for img_conv_groups --- .../paddle/trainer_config_helpers/networks.py | 49 ++++++++++++------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 1032569b2b..95f3a3f8f3 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -345,21 +345,34 @@ def img_conv_group(input, """ Image Convolution Group, Used for vgg net. - TODO(yuyang18): Complete docs - - :param conv_batchnorm_drop_rate: - :param input: - :param conv_num_filter: - :param pool_size: - :param num_channels: - :param conv_padding: - :param conv_filter_size: - :param conv_act: - :param conv_with_batchnorm: - :param pool_stride: - :param pool_type: - :param param_attr: - :return: + :param conv_batchnorm_drop_rate: if conv_with_batchnorm[i] is true, + conv_batchnorm_drop_rate[i] represents the drop rate of each batch norm. + :type conv_batchnorm_drop_rate: list + :param input: layer's input. + :type input: LayerOutput + :param conv_num_filter: output channels num. + :type conv_num_filter: int + :param pool_size: pooling filter size. + :type pool_size: int + :param num_channels: input channels num. + :type num_channels: int + :param conv_padding: convolution padding size. + :type conv_padding: int + :param conv_filter_size: convolution filter size. + :type conv_filter_size: int + :param conv_act: activation funciton after convolution. + :type conv_act: BaseActivation + :param conv_with_batchnorm: conv_with_batchnorm[i] represents + if there is a batch normalization after each convolution. + :type conv_with_batchnorm: list + :param pool_stride: pooling stride size. + :type pool_stride: int + :param pool_type: pooling type. + :type pool_type: BasePoolingType + :param param_attr: see img_conv_layer for details. 
+ :type param_attr: ParameterAttribute + :return: Layer's output + :type: LayerOutput """ tmp = input @@ -399,7 +412,7 @@ def img_conv_group(input, padding=conv_padding[i], filter_size=conv_filter_size[i], num_filters=conv_num_filter[i], - param_attr = param_attr, + param_attr=param_attr, **extra_kwargs) # logger.debug("tmp.num_filters = %d" % tmp.num_filters) @@ -1392,7 +1405,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(*[l.name for l in layers]) + Inputs(* [l.name for l in layers]) def outputs(layers, *args): @@ -1442,7 +1455,7 @@ def outputs(layers, *args): assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(*[l.name for l in layers]) + Outputs(* [l.name for l in layers]) return # just return outputs. if len(layers) != 1: From 19c465bb5b2b3405dce3c725e2c7aedba4e35117 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 21 Jul 2017 17:23:51 +0800 Subject: [PATCH 075/100] as aligned, defualt set mkldnn and mklml OFF --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 127493bce8..c6d2ce57f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,8 +37,8 @@ include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) -option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND}) -option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND}) +option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." OFF) +option(WITH_MKLML "Compile PaddlePaddle with mklml package." OFF) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) From 5f32cc10c2fa03c0c652ade733518434fe6de12a Mon Sep 17 00:00:00 2001 From: Zhaolong Xing Date: Fri, 21 Jul 2017 18:18:08 +0800 Subject: [PATCH 076/100] Update networks.py modify the format --- python/paddle/trainer_config_helpers/networks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 95f3a3f8f3..2272c052f8 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1405,7 +1405,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(* [l.name for l in layers]) + Inputs(*[l.name for l in layers]) def outputs(layers, *args): @@ -1455,7 +1455,7 @@ def outputs(layers, *args): assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(* [l.name for l in layers]) + Outputs(*[l.name for l in layers]) return # just return outputs. if len(layers) != 1: From 045470cf003b6a46339eb551593e1d5c25dd944e Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Fri, 21 Jul 2017 10:40:07 -0700 Subject: [PATCH 077/100] Slight simplification of eigen.h in hope to make it more readable. 
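[Editor's illustration — not part of the patch] The `Flatten` helper touched by this commit is, in effect, a rank-1 view over a tensor's existing buffer. The sketch below shows that idea with plain Eigen only, assuming nothing about Paddle's `Tensor`; the buffer and variable names are invented for the example.

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  float data[6] = {0, 1, 2, 3, 4, 5};

  // A 2 x 3 view over an existing buffer (Eigen tensors are column-major
  // by default), roughly analogous to a Tensor with a DDim of {2, 3}.
  Eigen::TensorMap<Eigen::Tensor<float, 2>> matrix(data, 2, 3);

  // "Flatten": re-map the same memory as a rank-1 tensor of length 2 * 3.
  Eigen::TensorMap<Eigen::Tensor<float, 1>> flat(data, 6);

  flat(5) = 42.0f;                    // write through the flat view
  std::cout << matrix(1, 2) << "\n";  // prints 42: same underlying buffer
  return 0;
}
```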
--- paddle/framework/eigen.h | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/paddle/framework/eigen.h b/paddle/framework/eigen.h index 2599b29508..5f3358c69b 100644 --- a/paddle/framework/eigen.h +++ b/paddle/framework/eigen.h @@ -61,25 +61,24 @@ struct EigenTensor { } }; +template +struct EigenMatrix : public EigenTensor {}; + template struct EigenVector : public EigenTensor { - // Flatten is to reshape a Tensor into a one dimension EigenVector - using Parent = EigenTensor; - static typename Parent::Type Flatten(Tensor& tensor) { - return Parent::From(tensor, - make_ddim({static_cast(product(tensor.dims_))})); + // Flatten reshapes a Tensor into an EigenVector. + static typename EigenVector::Type Flatten(Tensor& tensor) { + return EigenVector::From( + tensor, make_ddim({static_cast(product(tensor.dims_))})); } - static typename Parent::ConstType Flatten(const Tensor& tensor) { - return Parent::From(tensor, - make_ddim({static_cast(product(tensor.dims_))})); + static typename EigenVector::ConstType Flatten(const Tensor& tensor) { + return EigenVector::From( + tensor, make_ddim({static_cast(product(tensor.dims_))})); } }; -template -using EigenMatrix = EigenTensor; - } // namespace framework } // namespace paddle From 858dea8834bf54499c4954c56080c9de62214d52 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Fri, 21 Jul 2017 11:57:13 -0700 Subject: [PATCH 078/100] Move memory::Copy out from memory.h into memcpy.h --- paddle/memory/CMakeLists.txt | 1 + paddle/memory/memcpy.cc | 67 ++++++++++++++++++++++++++++++++++++ paddle/memory/memcpy.h | 33 ++++++++++++++++++ paddle/memory/memory.cc | 42 ---------------------- paddle/memory/memory.h | 9 ----- 5 files changed, 101 insertions(+), 51 deletions(-) create mode 100644 paddle/memory/memcpy.cc create mode 100644 paddle/memory/memcpy.h diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index fac442cca5..a5c4420ac0 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(detail) cc_library(memory SRCS memory.cc) +cc_library(memcpy SRCS memcpy.cc) cc_library(paddle_memory DEPS diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc new file mode 100644 index 0000000000..804369de82 --- /dev/null +++ b/paddle/memory/memcpy.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/memory/memcpy.h" + +#include // for memcpy + +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace memory { + +template <> +void Copy(platform::CPUPlace, void* dst, + platform::CPUPlace, + const void* src, size_t num) { + std::memcpy(dst, src, num); +} + +#ifndef PADDLE_ONLY_CPU +template <> +void Copy(platform::CPUPlace dst_place, + void* dst, + platform::GPUPlace src_place, + const void* src, size_t num, + cudaStream_t stream) { + platform::GPUPlaceGuard g(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); +} + +template <> +void Copy(platform::GPUPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, size_t num, + cudaStream_t stream) { + platform::GPUPlaceGuard g(dst_place.device); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); +} + +template <> +void Copy(platform::GPUPlace dst_place, + void* dst, + platform::GPUPlace src_place, + const void* src, size_t num, + cudaStream_t stream) { + if (dst_place == src_place) { + platform::GPUPlaceGuard g(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); + } else { + platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num, + stream); + } +} + +#endif // PADDLE_ONLY_CPU diff --git a/paddle/memory/memcpy.h b/paddle/memory/memcpy.h new file mode 100644 index 0000000000..99b1c2e1c3 --- /dev/null +++ b/paddle/memory/memcpy.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/platform/gpu_info.h" +#include "paddle/platform/place.h" + +namespace paddle { +namespace memory { + +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); + +#ifndef PADDLE_ONLY_CPU +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, + cudaStream_t stream); +#endif // PADDLE_ONLY_CPU + +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 78443cc35a..c2e046926f 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -46,13 +46,6 @@ size_t Used(platform::CPUPlace place) { return GetCPUBuddyAllocator()->Used(); } -template <> -void Copy(platform::CPUPlace, void* dst, - platform::CPUPlace, - const void* src, size_t num) { - std::memcpy(dst, src, num); -} - #ifndef PADDLE_ONLY_CPU detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { @@ -85,41 +78,6 @@ size_t Used(platform::GPUPlace place) { return GetGPUBuddyAllocator(place.device)->Used(); } -template <> -void Copy(platform::CPUPlace dst_place, - void* dst, - platform::GPUPlace src_place, - const void* src, size_t num, - cudaStream_t stream) { - platform::SetDeviceId(src_place.device); - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); -} - -template <> -void Copy(platform::GPUPlace dst_place, - void* dst, - platform::CPUPlace src_place, - const void* src, size_t num, - cudaStream_t stream) { - platform::SetDeviceId(dst_place.device); - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); -} - -template <> -void Copy(platform::GPUPlace dst_place, - void* dst, - platform::GPUPlace src_place, - const void* src, size_t num, - cudaStream_t stream) { - if (dst_place == src_place) { - platform::SetDeviceId(src_place.device); - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); - } else { - platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num, - stream); - } -} - #endif // PADDLE_ONLY_CPU } // namespace memory diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index 7ef7a73bc8..5e0d647072 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -29,15 +29,6 @@ void Free(Place, void*); template size_t Used(Place); -template -void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); - -#ifndef PADDLE_ONLY_CPU -template -void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, - cudaStream_t stream); -#endif // PADDLE_ONLY_CPU - template ::value>::type* = nullptr> From 51152adf0b0817182167654779f0db46260496a7 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Fri, 21 Jul 2017 11:59:07 -0700 Subject: [PATCH 079/100] Fix end of namespace --- paddle/memory/memcpy.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc index 804369de82..098931c887 100644 --- a/paddle/memory/memcpy.cc +++ b/paddle/memory/memcpy.cc @@ -65,3 +65,6 @@ void Copy(platform::GPUPlace dst_place, } #endif // PADDLE_ONLY_CPU + +} // namespace memory +} // namespace paddle From 616a236581e2e65dcaf72b3dab33bb7bfd3ea7a1 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Sat, 22 Jul 2017 08:34:06 -0700 Subject: [PATCH 080/100] Add target memcpy into paddle_memory --- paddle/memory/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index a5c4420ac0..4cd40e2c2a 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -5,7 +5,9 @@ 
cc_library(memcpy SRCS memcpy.cc) cc_library(paddle_memory DEPS - memory meta_data + memory + memcpy + meta_data meta_cache memory_block buddy_allocator system_allocator) From f81caa4e1114ef78afa15730f7963c289418fe63 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Sat, 22 Jul 2017 16:14:15 -0700 Subject: [PATCH 081/100] Add dependency memory->device_context, because we now use platform::GPUPlaceGuard --- paddle/memory/CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 4cd40e2c2a..3e53d1ce65 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -8,7 +8,10 @@ cc_library(paddle_memory memory memcpy meta_data - meta_cache memory_block - buddy_allocator system_allocator) + meta_cache + memory_block + buddy_allocator + system_allocator + device_context) cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory) From de6f9c487952c9f9668c2438d3ee901e12c4c8eb Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Sat, 22 Jul 2017 16:15:23 -0700 Subject: [PATCH 082/100] Add dependency memory->device_context, because we now use platform::GPUPlaceGuard --- paddle/memory/CMakeLists.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 3e53d1ce65..8035d93bfe 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1,7 +1,7 @@ add_subdirectory(detail) cc_library(memory SRCS memory.cc) -cc_library(memcpy SRCS memcpy.cc) +cc_library(memcpy SRCS memcpy.cc DEPS device_context) cc_library(paddle_memory DEPS @@ -11,7 +11,6 @@ cc_library(paddle_memory meta_cache memory_block buddy_allocator - system_allocator - device_context) + system_allocator) cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory) From 1294b3c53e26ccfa648535ef82fe3bc78e170348 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sun, 23 Jul 2017 21:26:50 -0500 Subject: [PATCH 083/100] Expose Net to Python (#2967) * Expose Net to Python * Expose PlainNet to Python, make python can add_op, complete_add_op * Provide a low level api to manipulate Net * Unittest for Net::DebugString --- paddle/framework/net.cc | 22 +++--- paddle/pybind/pybind.cc | 71 +++++++++++++------ .../paddle/v2/framework/tests/CMakeLists.txt | 1 + .../v2/framework/tests/test_plain_net.py | 30 ++++++++ 4 files changed, 95 insertions(+), 29 deletions(-) create mode 100644 python/paddle/v2/framework/tests/test_plain_net.py diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index 501536657d..139425b356 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -39,19 +39,22 @@ void PlainNet::CompleteAddOp(bool calc) { output_set.insert(opt); } } + inputs_.reserve(input_set.size()); std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs_)); + std::sort(inputs_.begin(), inputs_.end()); outputs_.reserve(output_set.size()); + std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs_)); + std::sort(outputs_.begin(), outputs_.end()); + std::vector tmp_index; tmp_index.reserve(temp_output.size()); - int idx = 0; - for (auto& opt : output_set) { - if (Contains(temp_output, opt)) { - tmp_index.push_back(idx); + int output_len = static_cast(outputs_.size()); + for (int i = 0; i < output_len; ++i) { + if (Contains(temp_output, outputs_[i])) { + tmp_index.push_back(i); } - outputs_.push_back(opt); - ++idx; } attrs_["temporary_index"] = tmp_index; @@ -59,9 +62,12 @@ void PlainNet::CompleteAddOp(bool 
calc) { std::string PlainNet::DebugString() const { std::ostringstream os; - os << this->type_ << ":" << std::endl; + os << OperatorBase::DebugString() << std::endl; for (auto& op : ops_) { - os << "\t" << op->DebugString() << std::endl; + std::istringstream is(op->DebugString()); + for (std::string line; std::getline(is, line);) { + os << " " << line << std::endl; + } } return os.str(); } diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 7a21588170..2c843839ce 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -13,16 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include -#include -#include -#include -#include -#include -#include #include #include +#include "paddle/framework/net.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" +#include "paddle/framework/scope.h" +#include "paddle/pybind/tensor_bind.h" +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + namespace py = pybind11; namespace pd = paddle::framework; @@ -35,8 +37,19 @@ USE_OP(sigmoid); USE_OP(softmax); USE_OP(rowwise_add); +template +void ExposeOperator(ClassType& m) { + m.def("infer_shape", &ClassType::type::InferShape) + .def("run", &ClassType::type::Run) + .def("outputs", + [](const typename ClassType::type& op) -> std::vector { + return op.outputs_; + }) + .def("__str__", &ClassType::type::DebugString); +} + PYBIND11_PLUGIN(core) { - py::module m("core", "C++ core of Paddle Paddle"); + py::module m("core", "C++ core of PaddlePaddle"); py::class_(m, "Tensor", py::buffer_protocol()) .def_buffer([](pd::Tensor& self) -> py::buffer_info { @@ -113,21 +126,37 @@ All parameter, weight, gradient are variables in Paddle. 
return new paddle::platform::CPUDeviceContext(); }); - py::class_(m, "Operator") - .def("__str__", &pd::OperatorBase::DebugString) + py::class_ operator_base(m, "Operator"); + + operator_base.def_static("create", [](py::bytes protobin) -> pd::OperatorPtr { + pd::OpDesc desc; + PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), + "Cannot parse user input to OpDesc"); + PADDLE_ENFORCE(desc.IsInitialized(), + "User OpDesc is not initialized, reason %s", + desc.InitializationErrorString()); + return pd::OpRegistry::CreateOp(desc); + }); + ExposeOperator(operator_base); + + using PlainNetPtr = std::shared_ptr; + py::class_ plain_net(m, "PlainNet"); + + plain_net .def_static("create", - [](py::bytes protobin) { - pd::OpDesc desc; - PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), - "Cannot parse user input to OpDesc"); - PADDLE_ENFORCE(desc.IsInitialized(), - "User OpDesc is not initialized, reason %s", - desc.InitializationErrorString()); - return pd::OpRegistry::CreateOp(desc); + []() -> std::shared_ptr { + auto retv = std::make_shared(); + retv->type_ = "plain_net"; + return retv; }) - .def("infer_shape", &pd::OperatorBase::InferShape) - .def("run", &pd::OperatorBase::Run) - .def("outputs", [](const pd::OperatorPtr& op) { return op->outputs_; }); + .def("add_op", &pd::PlainNet::AddOp) + .def("add_op", + [](PlainNetPtr& self, const PlainNetPtr& plain_net) -> void { + self->AddOp(std::static_pointer_cast(plain_net)); + }) + .def("complete_add_op", &pd::PlainNet::CompleteAddOp) + .def("complete_add_op", [](PlainNetPtr& self) { self->CompleteAddOp(); }); + ExposeOperator(plain_net); return m.ptr(); } diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index aa67792ebc..b3eb2ef8a8 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -3,6 +3,7 @@ add_python_test(test_framework test_scope.py test_default_scope_funcs.py test_op_creation_methods.py + test_plain_net.py test_tensor.py test_fc_op.py test_add_two_op.py diff --git a/python/paddle/v2/framework/tests/test_plain_net.py b/python/paddle/v2/framework/tests/test_plain_net.py new file mode 100644 index 0000000000..2b919aca28 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_plain_net.py @@ -0,0 +1,30 @@ +import paddle.v2.framework.core as core +from paddle.v2.framework.create_op_creation_methods import op_creations +import unittest + + +class TestNet(unittest.TestCase): + def test_net_all(self): + net = core.PlainNet.create() + op1 = op_creations.add_two(X="X", Y="Y", Out="Out") + net.add_op(op1) + + net2 = core.PlainNet.create() + net2.add_op(op_creations.fc(X="X", W="w", Y="fc.out")) + net2.complete_add_op(True) + net.add_op(net2) + net.complete_add_op(True) + + expected = ''' +Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, Out, fc.out). + Op(add_two), inputs:(X, Y), outputs:(Out). + Op(plain_net), inputs:(@EMPTY@, X, w), outputs:(@TEMP@fc@0, fc.out). + Op(fc), inputs:(X, w, @EMPTY@), outputs:(fc.out, @TEMP@fc@0). + Op(mul), inputs:(X, w), outputs:(@TEMP@fc@0). + Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc.out). 
+''' + self.assertEqual(expected, "\n" + str(net)) + + +if __name__ == '__main__': + unittest.main() From 18f4d24d0d111851b890c7a90020c0ddb115b74c Mon Sep 17 00:00:00 2001 From: zlx Date: Mon, 24 Jul 2017 10:46:25 +0800 Subject: [PATCH 084/100] moidify comment of im_conv_group --- python/paddle/trainer_config_helpers/networks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 95f3a3f8f3..28a71cf788 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -369,7 +369,8 @@ def img_conv_group(input, :type pool_stride: int :param pool_type: pooling type. :type pool_type: BasePoolingType - :param param_attr: see img_conv_layer for details. + :param param_attr: Convolution param attribute. + None means default attribute. :type param_attr: ParameterAttribute :return: Layer's output :type: LayerOutput From c2543f5b29df028e9eceec0273b882484998c03a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 24 Jul 2017 15:20:29 +0800 Subject: [PATCH 085/100] Remove ScopePtr and OperatorPtr * ScopePtr means pointer of scope, but it can be shared or uniqued. Change it to std::shared_ptr to make code better to read. --- paddle/framework/net.h | 10 +++++----- paddle/framework/net_op_test.cc | 4 ++-- paddle/framework/op_registry.h | 12 ++++++------ paddle/framework/op_registry_test.cc | 24 ++++++++++-------------- paddle/framework/operator.h | 7 +++---- paddle/framework/operator_test.cc | 12 +++++------- paddle/framework/scope.h | 5 ++--- paddle/pybind/pybind.cc | 5 +++-- 8 files changed, 36 insertions(+), 43 deletions(-) diff --git a/paddle/framework/net.h b/paddle/framework/net.h index 19c5fa223b..b2c64a8675 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -39,7 +39,7 @@ namespace framework { */ class Net : public OperatorBase { public: - virtual void AddOp(const OperatorPtr& op) = 0; + virtual void AddOp(const std::shared_ptr& op) = 0; virtual void CompleteAddOp(bool calc) = 0; }; @@ -57,7 +57,7 @@ class PlainNet : public Net { * Infer all the operators' input and output variables' shapes, will be called * before every mini-batch */ - void InferShape(const ScopePtr& scope) const override { + void InferShape(const std::shared_ptr& scope) const override { for (auto& op : ops_) { op->InferShape(scope); } @@ -70,7 +70,7 @@ class PlainNet : public Net { * scope will be used instead. If no OpContext is provicded, default context * will be used. 
*/ - void Run(const ScopePtr& scope, + void Run(const std::shared_ptr& scope, const platform::DeviceContext& dev_ctx) const override { for (auto& op : ops_) { op->Run(scope, dev_ctx); @@ -80,7 +80,7 @@ class PlainNet : public Net { /** * @brief Add an operator by ptr */ - void AddOp(const OperatorPtr& op) override { + void AddOp(const std::shared_ptr& op) override { PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed"); ops_.push_back(op); } @@ -89,7 +89,7 @@ class PlainNet : public Net { std::string DebugString() const override; - std::vector ops_; + std::vector> ops_; private: bool add_op_done_{false}; diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc index e814a7e43d..c179042c81 100644 --- a/paddle/framework/net_op_test.cc +++ b/paddle/framework/net_op_test.cc @@ -10,10 +10,10 @@ static int run_cnt = 0; class TestOp : public pd::OperatorBase { public: - void InferShape(const paddle::framework::ScopePtr& scope) const override { + void InferShape(const std::shared_ptr& scope) const override { ++infer_shape_cnt; } - void Run(const paddle::framework::ScopePtr& scope, + void Run(const std::shared_ptr& scope, const paddle::platform::DeviceContext& dev_ctx) const override { ++run_cnt; } diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index c41fe10729..165a68c1cf 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -227,10 +227,10 @@ class OpRegistry { } } - static OperatorPtr CreateOp(const std::string& type, - const VarNameList& inputs, - const VarNameList& outputs, - const AttributeMap& attrs) { + static std::shared_ptr CreateOp(const std::string& type, + const VarNameList& inputs, + const VarNameList& outputs, + const AttributeMap& attrs) { auto op_create_it = creators().find(type); PADDLE_ENFORCE(op_create_it != creators().end(), "Operator %s cannot be found", type); @@ -252,10 +252,10 @@ class OpRegistry { } op->Init(); - return OperatorPtr(op); + return std::shared_ptr(op); } - static OperatorPtr CreateOp(const OpDesc& op_desc) { + static std::shared_ptr CreateOp(const OpDesc& op_desc) { std::vector inputs; inputs.reserve((size_t)op_desc.inputs_size()); std::copy(op_desc.inputs().begin(), op_desc.inputs().end(), diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index 32a7e88a89..05095372d8 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -7,9 +7,9 @@ namespace paddle { namespace framework { class CosineOp : public OperatorBase { public: - void Run(const ScopePtr& scope, + void Run(const std::shared_ptr& scope, const platform::DeviceContext& dev_ctx) const override {} - void InferShape(const ScopePtr& scope) const override {} + void InferShape(const std::shared_ptr& scope) const override {} }; class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { @@ -27,8 +27,8 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { class MyTestOp : public OperatorBase { public: - void InferShape(const ScopePtr& scope) const override {} - void Run(const ScopePtr& scope, + void InferShape(const std::shared_ptr& scope) const override {} + void Run(const std::shared_ptr& scope, const platform::DeviceContext& dev_ctx) const override {} }; @@ -67,7 +67,7 @@ TEST(OpRegistry, CreateOp) { attr->set_type(paddle::framework::AttrType::FLOAT); attr->set_f(scale); - paddle::framework::OperatorPtr op = + std::shared_ptr op = paddle::framework::OpRegistry::CreateOp(op_desc); auto scope = 
std::make_shared(); paddle::platform::CPUDeviceContext dev_ctx; @@ -89,8 +89,7 @@ TEST(OpRegistry, IllegalAttr) { bool caught = false; try { - paddle::framework::OperatorPtr op __attribute__((unused)) = - paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::OpRegistry::CreateOp(op_desc); } catch (std::runtime_error& err) { caught = true; std::string msg = "larger_than check fail"; @@ -110,7 +109,7 @@ TEST(OpRegistry, DefaultValue) { ASSERT_TRUE(op_desc.IsInitialized()); - paddle::framework::OperatorPtr op = + std::shared_ptr op = paddle::framework::OpRegistry::CreateOp(op_desc); auto scope = std::make_shared(); paddle::platform::CPUDeviceContext dev_ctx; @@ -136,8 +135,7 @@ TEST(OpRegistry, CustomChecker) { // attr 'test_attr' is not set bool caught = false; try { - paddle::framework::OperatorPtr op __attribute__((unused)) = - paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::OpRegistry::CreateOp(op_desc); } catch (std::runtime_error& err) { caught = true; std::string msg = "Attribute 'test_attr' is required!"; @@ -155,8 +153,7 @@ TEST(OpRegistry, CustomChecker) { attr->set_i(3); caught = false; try { - paddle::framework::OperatorPtr op __attribute__((unused)) = - paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::OpRegistry::CreateOp(op_desc); } catch (std::runtime_error& err) { caught = true; std::string msg = "'test_attr' must be even!"; @@ -174,8 +171,7 @@ TEST(OpRegistry, CustomChecker) { attr->set_type(paddle::framework::AttrType::INT); attr->set_i(4); SetInputFormat(&op_desc); - paddle::framework::OperatorPtr op = - paddle::framework::OpRegistry::CreateOp(op_desc); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); paddle::platform::CPUDeviceContext dev_ctx; auto scope = std::make_shared(); op->Run(scope, dev_ctx); diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 5f046d6293..6b8dbb39ac 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -47,7 +47,6 @@ struct EigenDeviceConverter { #endif class OperatorBase; -using OperatorPtr = std::shared_ptr; /** * OperatorBase has the basic element that Net will call to do computation. * Only CreateOperator from OpRegistry will new Operator directly. User @@ -80,10 +79,10 @@ class OperatorBase { /// InferShape infer the size of Variables used by this Operator with /// information inside scope - virtual void InferShape(const ScopePtr& scope) const = 0; + virtual void InferShape(const std::shared_ptr& scope) const = 0; /// Net will call this function to Run an op. 
- virtual void Run(const ScopePtr& scope, + virtual void Run(const std::shared_ptr& scope, const platform::DeviceContext& dev_ctx) const = 0; // Get a input with argument's name described in `op_proto` @@ -208,7 +207,7 @@ class OperatorWithKernel : public OperatorBase { using OpKernelMap = std::unordered_map, OpKernelHash>; - void Run(const ScopePtr& scope, + void Run(const std::shared_ptr& scope, const platform::DeviceContext& dev_ctx) const final { auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx)); opKernel->Compute(KernelContext(this, scope, dev_ctx)); diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 8e55d0111f..3fae356c3e 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -24,8 +24,8 @@ static int op_run_num = 0; class OpWithoutKernelTest : public OperatorBase { public: void Init() override { x = 1; } - void InferShape(const ScopePtr& scope) const override {} - void Run(const ScopePtr& scope, + void InferShape(const std::shared_ptr& scope) const override {} + void Run(const std::shared_ptr& scope, const platform::DeviceContext& dev_ctx) const override { op_run_num++; ASSERT_EQ((int)inputs_.size(), 1); @@ -70,8 +70,7 @@ TEST(OperatorBase, all) { paddle::platform::CPUDeviceContext device_context; auto scope = std::make_shared(); - paddle::framework::OperatorPtr op = - paddle::framework::OpRegistry::CreateOp(op_desc); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); scope->CreateVariable("OUT1"); ASSERT_EQ(paddle::framework::op_run_num, 0); op->Run(scope, device_context); @@ -189,8 +188,7 @@ TEST(OpKernel, all) { paddle::platform::CPUDeviceContext cpu_device_context; auto scope = std::make_shared(); - paddle::framework::OperatorPtr op = - paddle::framework::OpRegistry::CreateOp(op_desc); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); op->Run(scope, cpu_device_context); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); @@ -236,6 +234,6 @@ TEST(OpKernel, multi_inputs) { paddle::platform::CPUDeviceContext cpu_device_context; auto scope = std::make_shared(); - OperatorPtr op(paddle::framework::OpRegistry::CreateOp(op_desc)); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); op->Run(scope, cpu_device_context); } diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index ec62c9189f..79c9ffd1a6 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -24,7 +24,6 @@ namespace paddle { namespace framework { class Scope; -using ScopePtr = std::shared_ptr; /** * @brief Scope that manage all variables. @@ -44,7 +43,7 @@ class Scope { /** * @brief Initialize a Scope with parent. */ - explicit Scope(const ScopePtr& parent) : parent_(parent) {} + explicit Scope(const std::shared_ptr& parent) : parent_(parent) {} /** * @brief Create Variable @@ -91,7 +90,7 @@ class Scope { private: std::unordered_map> vars_; - ScopePtr parent_{nullptr}; + std::shared_ptr parent_{nullptr}; }; } // namespace framework diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 2c843839ce..d48a948d21 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -126,9 +126,10 @@ All parameter, weight, gradient are variables in Paddle. 
return new paddle::platform::CPUDeviceContext(); }); - py::class_ operator_base(m, "Operator"); + py::class_> operator_base( + m, "Operator"); - operator_base.def_static("create", [](py::bytes protobin) -> pd::OperatorPtr { + operator_base.def_static("create", [](py::bytes protobin) { pd::OpDesc desc; PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), "Cannot parse user input to OpDesc"); From 8d3e065af851f1bf075f47c181b4e3cb925695d6 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Mon, 24 Jul 2017 14:59:22 +0800 Subject: [PATCH 086/100] "update faq" --- doc/faq/index_cn.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst index c14160d55e..e74236aed7 100644 --- a/doc/faq/index_cn.rst +++ b/doc/faq/index_cn.rst @@ -311,3 +311,10 @@ Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异 * 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。 主要的解决办法是减小学习律或者对数据进行归一化处理。 + +15. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2 +------------------------------------------------------------------------ +先查看一下是否曾经安装过paddle v1版本,有的话需要先卸载: + +pip uninstall py_paddle paddle + From 55fe0120e9c04e57ad77d0717c264bc7af8f55f2 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Mon, 24 Jul 2017 15:47:54 +0800 Subject: [PATCH 087/100] "update faq " --- doc/faq/index_cn.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst index e74236aed7..093754a2fa 100644 --- a/doc/faq/index_cn.rst +++ b/doc/faq/index_cn.rst @@ -318,3 +318,6 @@ Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异 pip uninstall py_paddle paddle +然后安装paddle的python环境 + +pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl From 13b5e68ccc3e2b1dae2e05a9ddaa9223b0807db8 Mon Sep 17 00:00:00 2001 From: dongzhihong Date: Mon, 24 Jul 2017 16:06:35 +0800 Subject: [PATCH 088/100] "add build directory" --- doc/faq/index_cn.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst index 093754a2fa..138efb566e 100644 --- a/doc/faq/index_cn.rst +++ b/doc/faq/index_cn.rst @@ -318,6 +318,6 @@ Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异 pip uninstall py_paddle paddle -然后安装paddle的python环境 +然后安装paddle的python环境, 在build目录下执行 pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl From f85ccdd3518c6d2d296c390e159ac302b837cc09 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 24 Jul 2017 16:48:32 +0800 Subject: [PATCH 089/100] Renew CMakeList dependence --- paddle/framework/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 36da6f649b..31b5ccdeae 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -19,10 +19,10 @@ cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) -cc_library(grad_op_creator SRCS grad_op_creator.cc) -cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc grad_op_creator) -cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator) -cc_test(grad_op_creator_test SRCS grad_op_creator_test.cc DEPS grad_op_creator op_registry operator add_op) +cc_library(grad_op_creator SRCS grad_op_creator.cc DEPS op_proto operator op_registry) +cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_creator) +cc_test(op_registry_test SRCS 
op_registry_test.cc DEPS op_registry) +cc_test(grad_op_creator_test SRCS grad_op_creator_test.cc DEPS grad_op_creator add_op) py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. From 380227b58aad44a6c48e72c3d4dd099833ec4f5f Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 24 Jul 2017 09:19:08 +0000 Subject: [PATCH 090/100] Renew CMakeList dependence --- paddle/framework/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 31b5ccdeae..a76a95644d 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -19,10 +19,10 @@ cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) -cc_library(grad_op_creator SRCS grad_op_creator.cc DEPS op_proto operator op_registry) +cc_library(grad_op_creator SRCS grad_op_creator.cc DEPS op_proto operator) cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_creator) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) -cc_test(grad_op_creator_test SRCS grad_op_creator_test.cc DEPS grad_op_creator add_op) +cc_test(grad_op_creator_test SRCS grad_op_creator_test.cc DEPS grad_op_creator op_registry add_op) py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. From f4e25550cd0d8900d8cb15dc1c1568f39f33047a Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 24 Jul 2017 18:04:36 +0800 Subject: [PATCH 091/100] Fix compile error Replace `OperatorPtr` with `std::shared_ptr` --- paddle/framework/grad_op_creator_test.cc | 5 +++-- paddle/framework/op_registry.h | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/framework/grad_op_creator_test.cc b/paddle/framework/grad_op_creator_test.cc index ad836727c3..27ac658131 100644 --- a/paddle/framework/grad_op_creator_test.cc +++ b/paddle/framework/grad_op_creator_test.cc @@ -9,8 +9,9 @@ namespace paddle { namespace framework { TEST(GradOpCreator, AddTwo) { - OperatorPtr add_op(OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {})); - OperatorPtr grad_add_op = OpRegistry::CreateGradOp(add_op); + std::shared_ptr add_op( + OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {})); + std::shared_ptr grad_add_op = OpRegistry::CreateGradOp(add_op); EXPECT_EQ(static_cast(grad_add_op->inputs_.size()), 4); EXPECT_EQ(static_cast(grad_add_op->outputs_.size()), 2); EXPECT_EQ(grad_add_op->Input("X"), "x"); diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 5c8b86c8a9..41c7830932 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -298,9 +298,10 @@ class OpRegistry { return CreateOp(op_desc.type(), inputs, outputs, attrs); } - static OperatorPtr CreateGradOp(OperatorPtr op) { + static std::shared_ptr CreateGradOp( + std::shared_ptr op) { GradOpCreator creator(op.get()); - OperatorPtr grad_op(creator.Create()); + std::shared_ptr grad_op(creator.Create()); grad_op->Init(); return grad_op; } From 81df39fe2af22ed7842c7d5cdf5f1d063aad7631 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 24 Jul 2017 18:58:38 +0800 Subject: [PATCH 092/100] fix compile errer --- paddle/framework/net_op_test.cc | 5 +++-- 1 file 
changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc index ccdfe19065..20b42cbb49 100644 --- a/paddle/framework/net_op_test.cc +++ b/paddle/framework/net_op_test.cc @@ -16,10 +16,11 @@ static int run_cnt = 0; class TestOp : public OperatorBase { public: - void InferShape(const std::shared_ptr& scope) const override { + void InferShape( + const std::shared_ptr& scope) const override { ++infer_shape_cnt; } - void Run(const std::shared_ptr& scope, + void Run(const std::shared_ptr& scope, const paddle::platform::DeviceContext& dev_ctx) const override { ++run_cnt; } From 77af58f8f73d19329c2703961d7cfc0581839308 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 24 Jul 2017 21:56:37 +0800 Subject: [PATCH 093/100] Change gradient Op registry mechanism OLD: op_type -> grad_op_creator NEW: grad_op_type -> grad_op_creator op_type -> grad_op_type --- ...{grad_op_creator.cc => grad_op_builder.cc} | 17 +++--- .../{grad_op_creator.h => grad_op_builder.h} | 6 +- paddle/framework/op_registry.h | 55 ++++++++++--------- paddle/operators/add_op.cc | 2 +- 4 files changed, 43 insertions(+), 37 deletions(-) rename paddle/framework/{grad_op_creator.cc => grad_op_builder.cc} (88%) rename paddle/framework/{grad_op_creator.h => grad_op_builder.h} (92%) diff --git a/paddle/framework/grad_op_creator.cc b/paddle/framework/grad_op_builder.cc similarity index 88% rename from paddle/framework/grad_op_creator.cc rename to paddle/framework/grad_op_builder.cc index 106c2eae9d..d9ec8a10a5 100644 --- a/paddle/framework/grad_op_creator.cc +++ b/paddle/framework/grad_op_builder.cc @@ -12,20 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/framework/grad_op_creator.h" +#include "paddle/framework/grad_op_builder.h" #include "paddle/framework/op_registry.h" namespace paddle { namespace framework { -OperatorBase* GradOpCreator::Create() { +OperatorBase* GradOpBuilder::Build() { BuildOpInOutArgList(); - OperatorBase* grad_op = OpRegistry::grad_creators().at(op_->type_)(); + std::string grad_op_type = OpRegistry::grad_ops().at(op->type_); + OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)(); + grad_op->type_ = grad_op_type; CompleteGradOp(grad_op); return grad_op; } -OpInOutArg* GradOpCreator::BuildArg(const VarProto& var, +OpInOutArg* GradOpBuilder::BuildArg(const VarProto& var, const VarIndexMap& var_map, const std::vector& format, InOutType type) { @@ -36,7 +38,7 @@ OpInOutArg* GradOpCreator::BuildArg(const VarProto& var, end_idx); } -void GradOpCreator::BuildOpInOutArgList() { +void GradOpBuilder::BuildOpInOutArgList() { const OpProto& op_proto = OpRegistry::protos().at(op_->type_); const auto& var_map = *(OpRegistry::VarIndexMaps().at(op_->type_)); const std::vector& in_format = @@ -57,7 +59,7 @@ void GradOpCreator::BuildOpInOutArgList() { } } -void GradOpCreator::AddArgIntoGradOp(const OpInOutArg* arg, +void GradOpBuilder::AddArgIntoGradOp(const OpInOutArg* arg, std::vector& in_out, std::vector& format, VarIndexMap* varmap, int& idx, @@ -80,8 +82,7 @@ void GradOpCreator::AddArgIntoGradOp(const OpInOutArg* arg, format.push_back(in_out.size()); } -void GradOpCreator::CompleteGradOp(OperatorBase* grad_op) const { - grad_op->type_ = op_->type_ + "@GRAD"; // not necessary +void GradOpBuilder::CompleteGradOp(OperatorBase* grad_op) const { grad_op->attrs_ = op_->attrs_; grad_op->attrs_.erase("input_format"); grad_op->attrs_.erase("output_format"); diff --git a/paddle/framework/grad_op_creator.h b/paddle/framework/grad_op_builder.h similarity index 92% rename from paddle/framework/grad_op_creator.h rename to paddle/framework/grad_op_builder.h index 21b160a73f..2ecf39479b 100644 --- a/paddle/framework/grad_op_creator.h +++ b/paddle/framework/grad_op_builder.h @@ -25,12 +25,12 @@ struct OpInOutArg { size_t end_idx_; }; -class GradOpCreator { +class GradOpBuilder { using VarIndexMap = std::unordered_map; public: - GradOpCreator(const OperatorBase* op) : op_(op) {} - OperatorBase* Create(); + GradOpBuilder(const OperatorBase* op) : op_(op) {} + OperatorBase* Build(); private: OpInOutArg* BuildArg(const VarProto& var, const VarIndexMap& var_map, diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 41c7830932..31a4151851 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -222,7 +222,7 @@ class OpRegistry { public: template static void RegisterOp(const std::string& op_type) { - creators()[op_type] = [] { return new OpType; }; + op_creators()[op_type] = [] { return new OpType; }; OpAttrChecker& op_checker = op_checkers()[op_type]; OpProto& op_proto = protos()[op_type]; auto maker = ProtoMakerType(&op_proto, &op_checker); @@ -245,17 +245,19 @@ class OpRegistry { } } - template - static void RegisterGradOp(const std::string& op_type) { - grad_creators()[op_type] = [] { return new OpType; }; + template + static void RegisterGradOp(const std::string& op_type, + const std::string& grad_op_type) { + op_creators()[grad_op_type] = [] { return new GradOpType; }; + grad_ops()[op_type] = grad_op_type; } static std::shared_ptr CreateOp(const std::string& type, const VarNameList& inputs, const VarNameList& outputs, const AttributeMap& attrs) { - auto 
op_create_it = creators().find(type); - PADDLE_ENFORCE(op_create_it != creators().end(), + auto op_create_it = op_creators().find(type); + PADDLE_ENFORCE(op_create_it != op_creators().end(), "Operator %s cannot be found.", type); auto op = op_create_it->second(); @@ -300,8 +302,8 @@ class OpRegistry { static std::shared_ptr CreateGradOp( std::shared_ptr op) { - GradOpCreator creator(op.get()); - std::shared_ptr grad_op(creator.Create()); + GradOpBuilder builder(op.get()); + std::shared_ptr grad_op(builder.Build()); grad_op->Init(); return grad_op; } @@ -311,9 +313,9 @@ class OpRegistry { return protos_; }; - static std::unordered_map& grad_creators() { - static std::unordered_map grad_creators_; - return grad_creators_; + static std::unordered_map& grad_ops() { + static std::unordered_map grad_ops_; + return grad_ops_; } static std::unordered_map>& @@ -322,12 +324,12 @@ class OpRegistry { return maps_; } - private: - static std::unordered_map& creators() { - static std::unordered_map creators_; - return creators_; + static std::unordered_map& op_creators() { + static std::unordered_map op_creators_; + return op_creators_; } + private: static std::unordered_map& op_checkers() { static std::unordered_map op_checkers_; return op_checkers_; @@ -353,11 +355,11 @@ class OpRegisterHelper { } }; -template +template class GradOpRegisterHelper { public: - GradOpRegisterHelper(const char* op_type) { - OpRegistry::RegisterGradOp(op_type); + GradOpRegisterHelper(const char* op_type, const char* grad_op_type) { + OpRegistry::RegisterGradOp(op_type, grad_op_type); } }; @@ -383,13 +385,16 @@ class GradOpRegisterHelper { /** * Macro to Register Gradient Operator. */ -#define REGISTER_GRADIENT_OP(__op_type, __op_class) \ - STATIC_ASSERT_GLOBAL_NAMESPACE( \ - __reg_gradient_op__##__op_type, \ - "REGISTER_GRADIENT_OP must be in global namespace"); \ - static ::paddle::framework::GradOpRegisterHelper<__op_class> \ - __op_gradient_register_##__op_type##__(#__op_type); \ - int __op_gradient_register_##__op_type##_handle__() { return 0; } +#define REGISTER_GRADIENT_OP(__op_type, __grad_op_type, __grad_op_class) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_gradient_op__##__op_type##__grad_op_type, \ + "REGISTER_GRADIENT_OP must be in global namespace"); \ + static ::paddle::framework::GradOpRegisterHelper<__grad_op_class> \ + __op_gradient_register_##__op_type##__grad_op_type##__(#__op_type, \ + #__grad_op_type); \ + int __op_gradient_register_##__op_type##__grad_op_type##_handle__() { \ + return 0; \ + } /** * Macro to Register OperatorKernel. 
diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc index ff60f9b314..8d415fbd2e 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_op.cc @@ -65,6 +65,6 @@ protected: } // namespace paddle REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker); -REGISTER_GRADIENT_OP(add_two, paddle::operators::AddOpGrad); +REGISTER_GRADIENT_OP(add_two, add_two_grad, paddle::operators::AddOpGrad); REGISTER_OP_CPU_KERNEL( add_two, paddle::operators::AddKernel); From 197c82f30faf0f24c958a8b0ca902e9d08582194 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 24 Jul 2017 22:02:45 +0800 Subject: [PATCH 094/100] Rewrite grad_op registry macro of mul, sigmoid and softmax --- paddle/operators/mul_op.cc | 2 +- paddle/operators/sigmoid_op.cc | 2 +- paddle/operators/softmax_op.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 89e0375a7a..cd74c8b976 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -67,7 +67,7 @@ protected: } // namespace paddle REGISTER_OP(mul, paddle::operators::MulOp, paddle::operators::MulOpMaker); -REGISTER_GRADIENT_OP(mul, paddle::operators::MulOpGrad); +REGISTER_GRADIENT_OP(mul, mul_grad, paddle::operators::MulOpGrad); REGISTER_OP_CPU_KERNEL( mul, paddle::operators::MulKernel); diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index 7dc58bbb10..bf63af28b0 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -56,7 +56,7 @@ protected: REGISTER_OP(sigmoid, paddle::operators::SigmoidOp, paddle::operators::SigmoidOpMaker); -REGISTER_GRADIENT_OP(sigmoid, paddle::operators::SigmoidOpGrad); +REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, paddle::operators::SigmoidOpGrad); REGISTER_OP_CPU_KERNEL( sigmoid, diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 1d10a415d0..82f72fa19f 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -59,6 +59,6 @@ protected: namespace ops = paddle::operators; REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker); -REGISTER_GRADIENT_OP(softmax, paddle::operators::SoftmaxOpGrad); +REGISTER_GRADIENT_OP(softmax, softmax_grad, paddle::operators::SoftmaxOpGrad); REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel); From ae6009dfefc95342e63e2ebc63be93d38ae05550 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 24 Jul 2017 22:28:52 +0800 Subject: [PATCH 095/100] fix compile error --- paddle/framework/CMakeLists.txt | 6 +++--- paddle/framework/grad_op_builder.cc | 2 +- .../{grad_op_creator_test.cc => grad_op_builder_test.cc} | 4 ++-- paddle/framework/op_registry.h | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) rename paddle/framework/{grad_op_creator_test.cc => grad_op_builder_test.cc} (89%) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index a76a95644d..433edbfda7 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -19,10 +19,10 @@ cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) -cc_library(grad_op_creator SRCS grad_op_creator.cc DEPS op_proto operator) -cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_creator) +cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator) +cc_library(op_registry SRCS op_registry.cc DEPS op_desc 
grad_op_builder) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) -cc_test(grad_op_creator_test SRCS grad_op_creator_test.cc DEPS grad_op_creator op_registry add_op) +cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op) py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. diff --git a/paddle/framework/grad_op_builder.cc b/paddle/framework/grad_op_builder.cc index d9ec8a10a5..6235be75f2 100644 --- a/paddle/framework/grad_op_builder.cc +++ b/paddle/framework/grad_op_builder.cc @@ -20,7 +20,7 @@ namespace framework { OperatorBase* GradOpBuilder::Build() { BuildOpInOutArgList(); - std::string grad_op_type = OpRegistry::grad_ops().at(op->type_); + std::string grad_op_type = OpRegistry::grad_ops().at(op_->type_); OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)(); grad_op->type_ = grad_op_type; CompleteGradOp(grad_op); diff --git a/paddle/framework/grad_op_creator_test.cc b/paddle/framework/grad_op_builder_test.cc similarity index 89% rename from paddle/framework/grad_op_creator_test.cc rename to paddle/framework/grad_op_builder_test.cc index 27ac658131..288a7841cd 100644 --- a/paddle/framework/grad_op_creator_test.cc +++ b/paddle/framework/grad_op_builder_test.cc @@ -1,4 +1,4 @@ -#include "paddle/framework/grad_op_creator.h" +#include "paddle/framework/grad_op_builder.h" #include #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" @@ -8,7 +8,7 @@ USE_OP(add_two); namespace paddle { namespace framework { -TEST(GradOpCreator, AddTwo) { +TEST(GradOpBuilder, AddTwo) { std::shared_ptr add_op( OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {})); std::shared_ptr grad_add_op = OpRegistry::CreateGradOp(add_op); diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 31a4151851..f16deae028 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -20,7 +20,7 @@ limitations under the License. */ #include #include #include "paddle/framework/attr_checker.h" -#include "paddle/framework/grad_op_creator.h" +#include "paddle/framework/grad_op_builder.h" #include "paddle/framework/op_desc.pb.h" #include "paddle/framework/scope.h" From 3ff0a9fbb1ddeb0926f90254b5acaca0c9e6e34f Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Wed, 19 Jul 2017 23:46:27 +0000 Subject: [PATCH 096/100] Implement distributed training save model, improve master.NewClient interface --- doc/design/cluster_train/save_model.md | 9 +-- go/master/c/client.go | 61 +++++++++++----- go/master/client.go | 94 ++++++++++++++++++++++--- go/master/client_test.go | 8 ++- go/master/etcd_client.go | 8 +-- go/master/service.go | 54 ++++++++++++-- go/pserver/client/c/cclient.go | 21 ++---- go/pserver/client/c/test/test_cclient.c | 4 -- go/pserver/client/client.go | 26 ------- go/pserver/service.go | 6 +- python/paddle/v2/__init__.py | 2 + python/paddle/v2/master/client.py | 35 +++++++-- python/paddle/v2/model.py | 73 +++++++++++++++++++ python/paddle/v2/reader/creator.py | 12 ++-- 14 files changed, 308 insertions(+), 105 deletions(-) create mode 100644 python/paddle/v2/model.py diff --git a/doc/design/cluster_train/save_model.md b/doc/design/cluster_train/save_model.md index b70f00176b..b755185c81 100644 --- a/doc/design/cluster_train/save_model.md +++ b/doc/design/cluster_train/save_model.md @@ -75,10 +75,11 @@ snapshot to a model will be a TODO for future. 
### Trainer Election One trainer will be elected as the one to save the model. When using -etcd, trainer ID is a randomly generated UUID, we will utilize etcd to -elect one trainer. When not using etcd, unique trainer IDs will be -given by the administrator, the trainer whose ID is "0" is elected to -save the model. +etcd, trainer ID is a randomly generated UUID, the trainer will +contact the master server requesting to save the model, and find out +if itself is elected. When the master server is not used, unique +trainer IDs will be given by the administrator, the trainer whose ID +is "0" is elected to save the model. ### Model Save Path diff --git a/go/master/c/client.go b/go/master/c/client.go index 9f5733075f..6d329937f0 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -33,7 +33,6 @@ import ( "unsafe" "github.com/PaddlePaddle/Paddle/go/master" - "github.com/coreos/etcd/clientv3" log "github.com/sirupsen/logrus" ) @@ -65,32 +64,32 @@ func remove(client C.paddle_master_client) *master.Client { } //export paddle_new_etcd_master_client +// +// bufSize is the record buffer size. func paddle_new_etcd_master_client(etcdEndpoints *C.char, timeout int, bufSize int) C.paddle_master_client { p := C.GoString(etcdEndpoints) - cli, err := clientv3.New(clientv3.Config{ - Endpoints: strings.Split(p, ","), - DialTimeout: time.Second * time.Duration(timeout), - }) + endpoints := strings.Split(p, ",") + c, err := master.NewClient( + master.WithEtcd(endpoints, time.Duration(timeout)*time.Second), + master.WithBuffer(bufSize), + ) if err != nil { panic(err) } - ch := make(chan string, 1) - a, err := master.GetKey(cli, master.DefaultAddrPath, timeout) - if err != nil { - panic(err) - } - ch <- a - go master.WatchKey(cli, master.DefaultAddrPath, ch) - c := master.NewClient(ch, bufSize) + return add(c) } //export paddle_new_master_client +// +// bufSize is the record buffer size. func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client { a := C.GoString(addr) - ch := make(chan string, 1) - ch <- a - c := master.NewClient(ch, bufSize) + c, err := master.NewClient(master.WithAddr(a), master.WithBuffer(bufSize)) + if err != nil { + panic(err) + } + return add(c) } @@ -117,9 +116,10 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int return C.PADDLE_MASTER_OK } -// return value: -// 0:ok -// -1:error +// paddle_next_record gets the nexts training record. +// +// returns number of bytes of the records if success, -1 if failed. +// //export paddle_next_record func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { c := get(client) @@ -143,6 +143,29 @@ func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { return C.int(size) } +// paddle_request_save_model requests the master server to approve the +// caller to save the model. +// +// returns 1 if the save the model request is approved, 0 if does the +// request is rejected because other trainer is saving the model, -1 +// if error happened. 
+// +//export paddle_request_save_model +func paddle_request_save_model(client C.paddle_master_client, trainerID string, blockMS int) C.int { + c := get(client) + need, err := c.RequestSaveModel(trainerID, time.Duration(blockMS)*time.Millisecond) + if err != nil { + log.Errorln(err) + return -1 + } + + if need { + return 1 + } + + return 0 +} + //export mem_free func mem_free(p unsafe.Pointer) { // "free" may be a better name for this function, but doing so diff --git a/go/master/client.go b/go/master/client.go index 7f33090dc7..bbf3768d96 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -16,17 +16,20 @@ package master import ( "os" + "sync" "time" "github.com/PaddlePaddle/Paddle/go/connection" "github.com/PaddlePaddle/recordio" + "github.com/coreos/etcd/clientv3" log "github.com/sirupsen/logrus" ) // Client is the client of the master server. type Client struct { - conn *connection.Conn - ch chan record + conn *connection.Conn + ch chan record + initChOnce sync.Once } type record struct { @@ -34,24 +37,83 @@ type record struct { err error } -// NewClient creates a new Client. +// WithBuffer sets the client to buffer the training record. // // bufSize is the record buffer size. NextRecord will read from this // buffer. -func NewClient(addrCh <-chan string, bufSize int) *Client { +func WithBuffer(bufSize int) func(*Client) error { + return func(c *Client) error { + if bufSize <= 0 { + return nil + } + + c.initChOnce.Do(func() { + c.ch = make(chan record, bufSize) + go c.getRecords() + }) + return nil + } +} + +// WithAddr sets the client to use fixed master address. +func WithAddr(addr string) func(c *Client) error { + return func(c *Client) error { + ch := make(chan string, 1) + ch <- addr + go c.monitorMaster(ch) + return nil + } +} + +// WithEtcd sets the client to use etcd for master discovery. +func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error { + return func(c *Client) error { + cli, err := clientv3.New(clientv3.Config{ + Endpoints: endpoints, + DialTimeout: timeout, + }) + if err != nil { + return err + } + + ch := make(chan string, 1) + a, err := GetKey(cli, DefaultAddrPath, timeout) + if err != nil { + return err + } + + if a != "" { + // Master is registered, send to the master address + // channel. + ch <- a + } + + go watchKey(cli, DefaultAddrPath, ch) + go c.monitorMaster(ch) + return nil + } +} + +// NewClient creates a new Client. +func NewClient(opts ...func(*Client) error) (*Client, error) { c := &Client{} c.conn = connection.New() - c.ch = make(chan record, bufSize) - go c.monitorMaster(addrCh) - go c.getRecords() - return c + + for _, opt := range opts { + err := opt(c) + if err != nil { + return nil, err + } + + } + + return c, nil } func (c *Client) getRecords() { for { t, err := c.getTask() if err != nil { - // getTask call. log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err) time.Sleep(3 * time.Second) continue @@ -146,6 +208,20 @@ func (c *Client) taskFailed(meta TaskMeta) error { // NextRecord will block until the next record is available. It is // thread-safe. func (c *Client) NextRecord() ([]byte, error) { + c.initChOnce.Do(func() { + // initialize with in case WithBuffer is not used. + c.ch = make(chan record, 0) + go c.getRecords() + }) + r := <-c.ch return r.r, r.err } + +// RequestSaveModel requests the master server to approve the caller +// to save the model. 
+func (c *Client) RequestSaveModel(trainerID string, blockDur time.Duration) (bool, error) { + var need bool + err := c.conn.Call("Service.RequestSaveModel", SaveModelRequest{TrainerID: trainerID, BlockDur: blockDur}, &need) + return need, err +} diff --git a/go/master/client_test.go b/go/master/client_test.go index a90062c753..a3a434ae7e 100644 --- a/go/master/client_test.go +++ b/go/master/client_test.go @@ -87,9 +87,11 @@ func TestNextRecord(t *testing.T) { panic(err) } - curAddr := make(chan string, 1) - curAddr <- fmt.Sprintf(":%d", p) - c := master.NewClient(curAddr, 10) + c, err := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(10)) + if err != nil { + panic(err) + } + err = c.SetDataset([]string{path}) if err != nil { panic(err) diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go index 607e726251..ae6b6f776b 100644 --- a/go/master/etcd_client.go +++ b/go/master/etcd_client.go @@ -158,8 +158,8 @@ func (e *EtcdClient) Load() ([]byte, error) { } // GetKey gets the value by the specify key. -func GetKey(c *clientv3.Client, key string, timeout int) (string, error) { - ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(timeout)) +func GetKey(c *clientv3.Client, key string, timeout time.Duration) (string, error) { + ctx, cancel := context.WithTimeout(context.Background(), timeout) resp, err := c.Get(ctx, key) cancel() if err != nil { @@ -173,8 +173,8 @@ func GetKey(c *clientv3.Client, key string, timeout int) (string, error) { return string(v), nil } -// WatchKey watches the specify key and send to valChan if there is some event. -func WatchKey(c *clientv3.Client, key string, valChan chan<- string) { +// watchKey watches the specify key and send to valChan if there is some event. +func watchKey(c *clientv3.Client, key string, valChan chan<- string) { rch := c.Watch(context.Background(), key) for wresp := range rch { for _, ev := range wresp.Events { diff --git a/go/master/service.go b/go/master/service.go index 2766720c28..d1ec8939e1 100644 --- a/go/master/service.go +++ b/go/master/service.go @@ -78,9 +78,10 @@ type Service struct { ready chan struct{} store Store - mu sync.Mutex - initDone bool - taskQueues taskQueues + mu sync.Mutex + initDone bool + taskQueues taskQueues + savingTrainer string } func partition(chunks []Chunk, chunksPerTask int) []taskEntry { @@ -246,7 +247,7 @@ func readChunks(globPaths []string) ([]Chunk, error) { // // SetDataset can be call multiple times. But only the first call will // be honored. -func (s *Service) SetDataset(globPaths []string, dummy *int) error { +func (s *Service) SetDataset(globPaths []string, _ *int) error { if len(globPaths) == 0 { return errors.New("no dataset specified") } @@ -330,7 +331,7 @@ func (s *Service) logFields() log.Fields { } // GetTask gets a new task from the service. -func (s *Service) GetTask(dummy int, task *Task) error { +func (s *Service) GetTask(_ int, task *Task) error { select { case <-s.ready: } @@ -380,7 +381,7 @@ func (s *Service) GetTask(dummy int, task *Task) error { } // TaskFinished tell the service that a task is finished. -func (s *Service) TaskFinished(taskID int, dummy *int) error { +func (s *Service) TaskFinished(taskID int, _ *int) error { select { case <-s.ready: } @@ -415,7 +416,7 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error { } // TaskFailed tells the service that a task is failed. 
-func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error { +func (s *Service) TaskFailed(meta TaskMeta, _ *int) error { select { case <-s.ready: } @@ -432,3 +433,42 @@ func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error { s.processFailedTask(t, meta.Epoch) return nil } + +// SaveModelRequest is the request for saving model +type SaveModelRequest struct { + TrainerID string + BlockDur time.Duration +} + +// RequestSaveModel requests the master server to approve the caller +// to save the model. +func (s *Service) RequestSaveModel(req SaveModelRequest, need *bool) error { + s.mu.Lock() + defer s.mu.Unlock() + + if req.TrainerID == "" { + return errors.New("trainer id is empty") + } + + if s.savingTrainer == "" { + *need = true + } else { + if req.TrainerID == s.savingTrainer { + // save trainer asked to save model again + *need = true + } else { + *need = false + } + } + + if *need { + s.savingTrainer = req.TrainerID + time.AfterFunc(req.BlockDur, func() { + s.mu.Lock() + s.savingTrainer = "" + s.mu.Unlock() + }) + } + + return nil +} diff --git a/go/pserver/client/c/cclient.go b/go/pserver/client/c/cclient.go index 24cd922ffe..0f7e20cdd8 100644 --- a/go/pserver/client/c/cclient.go +++ b/go/pserver/client/c/cclient.go @@ -127,13 +127,19 @@ func paddle_pserver_client_release(client C.paddle_pserver_client) { remove(client) } +// paddle_begin_init_params tells trainer if it needs to init the +// parameters. +// +// returns 1 if the trainer needs to init the parameters. 0 if the +// trainer does not need to init the parameters. +// //export paddle_begin_init_params func paddle_begin_init_params(client C.paddle_pserver_client) C.int { c := get(client) if selected := c.BeginInitParams(); selected { return 1 } - return C.PSERVER_OK + return 0 } //export paddle_init_param @@ -256,17 +262,4 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, return C.PSERVER_OK } -//export paddle_save_model -func paddle_save_model(client C.paddle_pserver_client, path *C.char) C.int { - p := C.GoString(path) - c := get(client) - err := c.Save(p) - if err != nil { - log.Errorln(err) - return C.PSERVER_ERROR - } - - return C.PSERVER_OK -} - func main() {} // Required but ignored diff --git a/go/pserver/client/c/test/test_cclient.c b/go/pserver/client/c/test/test_cclient.c index f9b9967434..89c4d7f00a 100644 --- a/go/pserver/client/c/test/test_cclient.c +++ b/go/pserver/client/c/test/test_cclient.c @@ -111,9 +111,5 @@ retry: getParams(c); } - if (paddle_save_model(c, "/tmp/")) { - fail(); - } - return 0; } diff --git a/go/pserver/client/client.go b/go/pserver/client/client.go index ddb749d629..15adda4735 100644 --- a/go/pserver/client/client.go +++ b/go/pserver/client/client.go @@ -219,32 +219,6 @@ func (c *Client) GetParams(names []string) ([]pserver.Parameter, error) { return ps, nil } -// Save indicates parameters to save the parameter to the given path. -func (c *Client) Save(path string) error { - errCh := make(chan error, len(c.pservers)) - - for _, p := range c.pservers { - err := p.Call("Service.Save", path, nil) - errCh <- err - } - - recv := 0 - for err := range errCh { - if err != nil { - return err - } - - recv++ - if recv == len(c.pservers) { - break - } - } - - // TODO(helin): there will be many files under path, need to - // merge them into a single file. 
- return nil -} - func strHash(s string) uint32 { h := fnv.New32a() _, _ = h.Write([]byte(s)) diff --git a/go/pserver/service.go b/go/pserver/service.go index a7767afa63..7d297c46d0 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -164,7 +164,7 @@ func NewService(idx int, interval time.Duration, path string, client *EtcdClient } // InitParam initializes a parameter. -func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error { +func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error { select { case <-s.initialized: return errors.New(AlreadyInitialized) @@ -185,7 +185,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er // FinishInitParams tells the parameter server that the parameter // initialization has finished. -func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error { +func (s *Service) FinishInitParams(_ int, _ *int) error { select { case <-s.initialized: return errors.New(AlreadyInitialized) @@ -198,7 +198,7 @@ func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error { // SendGrad sends gradient to parameter servers for parameter // optimization. -func (s *Service) SendGrad(g Gradient, dummy *int) error { +func (s *Service) SendGrad(g Gradient, _ *int) error { select { case <-s.initialized: default: diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 3c75ca4c3a..07ab2c9b18 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -33,6 +33,7 @@ import networks import minibatch import plot import image +import model __all__ = [ 'optimizer', @@ -54,6 +55,7 @@ __all__ = [ 'evaluator', 'image', 'master', + 'model', ] diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py index 4c041fb509..4dc31bff58 100644 --- a/python/paddle/v2/master/client.py +++ b/python/paddle/v2/master/client.py @@ -10,11 +10,31 @@ class client(object): client is a client to the master server. """ - def __init__(self, etcd_endpoints, timeout, buf_size): - self.c = lib.paddle_new_etcd_master_client(etcd_endpoints, timeout, + def __init__(self, etcd_endpoints, timeout_sec, buf_size=0): + self.c = lib.paddle_new_etcd_master_client(etcd_endpoints, timeout_sec, buf_size) - def close(self): + def request_save_model(self, trainer_id, block_ms): + """request to save model + + Conventionally the 0-th trainer will save model. But in + distributed training, any trainer could be killed. This + function asks the master server if the trainer should proceed + with saving model. + + :param trainer_id: trainer id. + :param block_ms: number of millisecond that other save model + will be blocked if this save model request succeeded. + + Returns: + int: 1 if the save the model request is approved, 0 if + does the request is rejected because other trainer is + saving the model, -1 if error happened. + + """ + return lib.paddle_request_save_model(self.c, trainer_id, block_ms) + + def release(self): lib.paddle_release_master_client(self.c) self.c = None @@ -27,10 +47,13 @@ class client(object): holder[idx] = c_ptr lib.paddle_set_dataset(self.c, holder, len(paths)) - # return format: (record, errno) - # errno = 0: ok - # < 0: error def next_record(self): + """gets next record for training + + Returns: + string: the record. + int: error code, 0 if successful, < 0 otherwise. 
+ """ p = ctypes.c_char_p() ret = ctypes.pointer(p) size = lib.paddle_next_record(self.c, ret) diff --git a/python/paddle/v2/model.py b/python/paddle/v2/model.py new file mode 100644 index 0000000000..20c3282098 --- /dev/null +++ b/python/paddle/v2/model.py @@ -0,0 +1,73 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import errno +import uuid + +import paddle.v2.master + +__all__ = ["save_model", "load_model"] + +trainer_id = str(uuid.uuid4()) + + +def mkdir_p(path): + try: + os.makedirs(path) + except OSError as exc: + if exc.errno == errno.EEXIST and os.path.isdir(path): + pass + else: + raise + + +def save_model(parameters, path): + need_request = "KUBERNETES_SERVICE_HOST" in os.environ.keys() + + if need_request: + # TODO(helin): figure out how MPI trains, since MPI only save + # model when trainer_id == "0", we can consolidate the logic + # here. + + # TODO(helin): change this environment variable name from + # MASTER_IP to ETCD_IP + etcd_name = "MASTER_IP" + if etcd_name not in os.environ.keys(): + raise Exception('not find ' + etcd_name + + ' in environment variable.') + + etcd_ip = os.environ.get(etcd_name) + client = master.client("http://" + etcd_ip + ":2379", 5, 0) + r = client.request_save_model(trainer_id, 5000) + if r == 0: + # do not need to save + return + elif r < 0: + # error + return + else: + # save model + path = os.path.join(path, trainer_id) + path = os.path.join(path, "model.tar") + + mkdir_p(path) + + with open(path, 'wb') as f: + parameters.to_tar(f) + + +def load_model(parameters, path): + with open(path, 'rb') as f: + parameters.from_tar(f) diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py index 61b5cc134f..55a0fcdf56 100644 --- a/python/paddle/v2/reader/creator.py +++ b/python/paddle/v2/reader/creator.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Creator package contains some simple reader creator, which could be used in user -program. +Creator package contains some simple reader creator, which could +be used in user program. """ __all__ = ['np_array', 'text_file', "recordio"] @@ -59,7 +59,7 @@ def text_file(path): def recordio_local(paths, buf_size=100): """ - Creates a data reader from given RecordIO file paths separated by ",", + Creates a data reader from given RecordIO file paths separated by ",", glob pattern is supported. :path: path of recordio files. :returns: data reader of recordio files. @@ -83,7 +83,7 @@ def recordio_local(paths, buf_size=100): def recordio(paths, buf_size=100): """ - Creates a data reader that outputs record one one by one + Creates a data reader that outputs record one one by one from given local or cloud recordio path. :path: path of recordio files. :returns: data reader of recordio files. 
From c67d8276b7cf874c56e69a7ffa6f4f4168680634 Mon Sep 17 00:00:00 2001
From: Helin Wang
Date: Mon, 24 Jul 2017 23:10:37 +0000
Subject: [PATCH 097/100] fix according to comments

---
 go/master/c/client.go | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/go/master/c/client.go b/go/master/c/client.go
index 6d329937f0..a2b18e4b47 100644
--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@@ -22,6 +22,9 @@ package main
 #define PADDLE_MASTER_OK 0
 #define PADDLE_MASTER_ERROR -1
 
+#define PADDLE_SAVE_MODEL_OK 1
+#define PADDLE_SAVE_MODEL_SKIP 0
+
 typedef int paddle_master_client;
 */
 import "C"
@@ -146,7 +149,7 @@ func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
 // paddle_request_save_model requests the master server to approve the
 // caller to save the model.
 //
-// returns 1 if the save the model request is approved, 0 if does the
+// returns 1 if the save the model request is approved, 0 if the
 // request is rejected because other trainer is saving the model, -1
 // if error happened.
 //
@@ -156,14 +159,14 @@ func paddle_request_save_model(client C.paddle_master_client, trainerID string,
 	need, err := c.RequestSaveModel(trainerID, time.Duration(blockMS)*time.Millisecond)
 	if err != nil {
 		log.Errorln(err)
-		return -1
+		return C.PADDLE_MASTER_ERROR
 	}
 
 	if need {
-		return 1
+		return C.PADDLE_SAVE_MODEL_OK
 	}
 
-	return 0
+	return C.PADDLE_SAVE_MODEL_SKIP
 }
 
 //export mem_free

From e8a0e92b5f00c19b066cc4b94adeeecd2ca4cbab Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Tue, 25 Jul 2017 10:54:15 +0800
Subject: [PATCH 098/100] Fix compile error

---
 paddle/operators/add_op_test.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/operators/add_op_test.cc b/paddle/operators/add_op_test.cc
index 7fc1049893..3d52f54983 100644
--- a/paddle/operators/add_op_test.cc
+++ b/paddle/operators/add_op_test.cc
@@ -22,7 +22,7 @@ TEST(AddOp, GetOpProto) {
   auto& protos = paddle::framework::OpRegistry::protos();
   auto it = protos.find("add_two");
   ASSERT_NE(it, protos.end());
-  auto& grad_creators = paddle::framework::OpRegistry::grad_creators();
-  auto it1 = grad_creators.find("add_two");
-  ASSERT_NE(it1, grad_creators.end());
+  auto& op_creators = paddle::framework::OpRegistry::op_creators();
+  auto it1 = op_creators.find("add_two_grad");
+  ASSERT_NE(it1, op_creators.end());
 }

From c94b275553fc51d2b70bf8ef675cf20c84243d9a Mon Sep 17 00:00:00 2001
From: wanghaoshuang
Date: Tue, 25 Jul 2017 12:30:41 +0800
Subject: [PATCH 099/100] fix conv layer reshape

---
 paddle/gserver/layers/ConvBaseProjection.cpp | 48 ++++++++------------
 paddle/gserver/layers/ConvBaseProjection.h   |  6 ---
 2 files changed, 19 insertions(+), 35 deletions(-)

diff --git a/paddle/gserver/layers/ConvBaseProjection.cpp b/paddle/gserver/layers/ConvBaseProjection.cpp
index d1e932ded5..eb6b0445c9 100644
--- a/paddle/gserver/layers/ConvBaseProjection.cpp
+++ b/paddle/gserver/layers/ConvBaseProjection.cpp
@@ -87,9 +87,6 @@ void ConvBaseProjection::initCudnn() {
   bwdDataLimitBytes_ = 0;
   bwdFilterLimitBytes_ = 0;
   workSpaceInBytes_ = 0;
-
-  batchNum_ = 0;
-  isSelectAlgo_ = false;
 }
 
 void ConvBaseProjection::reshapeTensorDesc(int batchSize) {
@@ -142,32 +139,25 @@ void ConvBaseProjection::reshape(int batchSize) {
   CHECK_EQ(width, out_->value->getWidth());
   CHECK_EQ(calInputSize(), in_->value->getWidth());
 
-  isSelectAlgo_ = (batchSize == batchNum_);
-  batchNum_ = batchSize;
-
-  if (!isSelectAlgo_) {
-    reshapeTensorDesc(batchSize);
-    hl_conv_workspace(imageDesc_,
-                      outputDesc_,
-                      filterDesc_,
-                      convDesc_,
-                      &fwdAlgo_,
-                      &fwdLimitBytes_,
-                      &bwdDataAlgo_,
-                      &bwdDataLimitBytes_,
-                      &bwdFilterAlgo_,
-                      &bwdFilterLimitBytes_);
-
-    size_t maxWorkSpace = 0;
-    maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
-    maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
-    workSpaceInBytes_ = maxWorkSpace;
-
-    VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
-            << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
-  }
-
-  isSelectAlgo_ = true;
+  reshapeTensorDesc(batchSize);
+  hl_conv_workspace(imageDesc_,
+                    outputDesc_,
+                    filterDesc_,
+                    convDesc_,
+                    &fwdAlgo_,
+                    &fwdLimitBytes_,
+                    &bwdDataAlgo_,
+                    &bwdDataLimitBytes_,
+                    &bwdFilterAlgo_,
+                    &bwdFilterLimitBytes_);
+
+  size_t maxWorkSpace = 0;
+  maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
+  maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
+  workSpaceInBytes_ = maxWorkSpace;
+
+  VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
+          << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
 }
 
 void *ConvBaseProjection::getSpaceBytes(size_t size) {
diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h
index 4a33aa1837..e9d9f8f1b2 100644
--- a/paddle/gserver/layers/ConvBaseProjection.h
+++ b/paddle/gserver/layers/ConvBaseProjection.h
@@ -101,12 +101,6 @@ protected:
   size_t bwdFilterLimitBytes_;
   /// Size of total work space.
   size_t workSpaceInBytes_;
-
-  /// Whether to call cuDNN api to choose conv algorithm.
-  bool isSelectAlgo_;
-  /// batchNum is used to record batch size. If the batch size is changed,
-  /// the selection algorithm will be called.
-  int batchNum_;
 
   bool bias_;
   std::unique_ptr<Weight> weight_;

From 41e1484eb2c8c15e4a329ff48419404043c55d0c Mon Sep 17 00:00:00 2001
From: hedaoyuan
Date: Tue, 25 Jul 2017 14:18:06 +0800
Subject: [PATCH 100/100] Fix hl_sequence_avg_forward.

---
 paddle/cuda/src/hl_cuda_sequence.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index 0fe2877f89..4f650ce03c 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -330,7 +330,7 @@ __global__ void KeSequenceAvgForward(real* dst,
     }
     sum = mode == 1 ? sum :
         (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
-    dst[gid] = sum;
+    dst[gid] += sum;
   }
 }
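For reference, the arithmetic performed by `KeSequenceAvgForward` is simple to state outside CUDA: mode 1 keeps the per-sequence sum, mode 0 divides by the sequence length, and any other mode scales the sum by 1/sqrt(length); the fix above makes the kernel accumulate into the output instead of overwriting it. A small NumPy sketch of the same reduction (not part of the patch, function and variable names are illustrative):

```python
import numpy as np


def sequence_avg_forward(dst, seq, mode):
    """Mirror of the kernel's per-sequence reduction; accumulates into dst."""
    s = seq.sum(axis=0)
    n = float(len(seq))
    if mode == 1:
        out = s                   # plain sum
    elif mode == 0:
        out = s / n               # mean over the sequence
    else:
        out = s / np.sqrt(n)      # sum * rsqrt(length)
    dst += out                    # matches the patched `dst[gid] += sum`
    return dst


# Example: mean over a 3-step sequence of 4-dim vectors.
dst = np.zeros(4)
seq = np.arange(12, dtype=np.float64).reshape(3, 4)
print(sequence_avg_forward(dst, seq, mode=0))  # [4. 5. 6. 7.]
```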