Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into reshape_op_dev

8 years ago · 31cbb3432f
parent 7ae72f752d af523df42e
commit 31cbb3432f
40 changed files with 576 additions and 921 deletions
--- a/paddle/gserver/layers/DeConv3DLayer.cpp
+++ b/paddle/gserver/layers/DeConv3DLayer.cpp
@ -53,27 +53,27 @@ bool DeConv3DLayer::init(const LayerMap &layerMap,
 size_t DeConv3DLayer::getSize() {
  CHECK_NE(inputLayers_.size(), 0UL);
-  outputH_.clear();
+  imgSizeW_.clear();
-  outputW_.clear();
+  imgSizeH_.clear();
-  outputD_.clear();
+  imgSizeD_.clear();
  N_.clear();
  NOut_.clear();
  size_t layerSize = 0;
  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    outputW_.push_back(
+    imgSizeW_.push_back(
-        imageSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
+        imageSize(outputW_[i], filterSize_[i], padding_[i], stride_[i], true));
-    outputH_.push_back(imageSize(
+    imgSizeH_.push_back(imageSize(
-        imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
+        outputH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
-    outputD_.push_back(imageSize(
+    imgSizeD_.push_back(imageSize(
-        imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
+        outputD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
-    NOut_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
+    NOut_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]);
-    N_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]);
+    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
    layerSize += NOut_[i] * numFilters_;
  }
-  getOutput().setFrameHeight(outputH_[0]);
+  getOutput().setFrameHeight(imgSizeH_[0]);
-  getOutput().setFrameWidth(outputW_[0]);
+  getOutput().setFrameWidth(imgSizeW_[0]);
-  getOutput().setFrameDepth(outputD_[0]);
+  getOutput().setFrameDepth(imgSizeD_[0]);
  return layerSize;
 }
@ -103,9 +103,9 @@ void DeConv3DLayer::forward(PassType passType) {
      }
      colBuf_->col2Vol(outMat->getData() + n * outMat->getStride(),
                       numFilters_,
-                       outputD_[i],
+                       imgSizeD_[i],
-                       outputH_[i],
+                       imgSizeH_[i],
-                       outputW_[i],
+                       imgSizeW_[i],
                       filterSizeZ_[i],
                       filterSizeY_[i],
                       filterSize_[i],
@ -144,9 +144,9 @@ void DeConv3DLayer::backward(const UpdateCallback &callback) {
        colBuf_->vol2Col(
            getOutputGrad()->getData() + n * getOutputGrad()->getStride(),
            numFilters_,
-            outputD_[i],
+            imgSizeD_[i],
-            outputH_[i],
+            imgSizeH_[i],
-            outputW_[i],
+            imgSizeW_[i],
            filterSizeZ_[i],
            filterSizeY_[i],
            filterSize_[i],
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@ -77,24 +77,6 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
 }
 void MKLDNNFcLayer::convertOutputToOtherDevice() {
  copyOutputInfoToOtherDevice();
  // Look for CPU-side outputs; the FC result needs no format conversion
  // there, so the CPU output simply shares this layer's value pointer.
  int cpuOutputs = 0;
  for (auto& other : outputOtherDevice_) {
    if (other.deviceId != CPU_DEVICE) {
      continue;
    }
    other.value = output_.value;
    ++cpuOutputs;
  }
  // More than one CPU output is unexpected; warn but keep going.
  if (cpuOutputs > 1) {
    LOG(WARNING) << "should not have more than one CPU devie";
  }
 }
 void MKLDNNFcLayer::reshape() {
  const Argument& input = getInput(0, getPrev(0)->getDeviceId());
  int batchSize = input.getBatchSize();
@ -155,7 +137,10 @@ void MKLDNNFcLayer::resetFwd() {
  // change original output value to mkldnn output value
  output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
  if (!outputIsOnlyMKLDNN()) {
-    convertOutputToOtherDevice();
+    copyOutputInfoToOtherDevice();
    // fc cpu output value do not need create convert
    // just share point
    getOutput(CPU_DEVICE).value->setData(output_.value->getData());
  }
  // create forward handle
@ -235,13 +220,12 @@ void MKLDNNFcLayer::resetBwd() {
  pipelineBwd_.push_back(*bwdWgt_);
  /// backward data
-  device = inputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
+  const MatrixPtr& in = inputLayers_[0]->getOutput().grad;
  const MatrixPtr& in = getInputGrad(0, device);
  if (in == nullptr) {
    return;
  }
-  if (getInput(0, device).getAllCount() > 1) {
+  if (getInput(0, MKLDNN_DEVICE).getAllCount() > 1) {
-    // TODO(TJ): use outputMaps_ ways when merge outgrad done
+    // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
  } else {
    inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
  }
@ -258,13 +242,21 @@ void MKLDNNFcLayer::resetBwd() {
  pipelineBwd_.push_back(*bwdData_);
 }
 void MKLDNNFcLayer::updateInputData() {
  // Only an input layer of type "data" requires re-pointing the input value
  // at the current CPU input buffer; any other layer type is left untouched.
  if (inputLayers_[0]->getType() == "data") {
    inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
  }
 }
 void MKLDNNFcLayer::forward(PassType passType) {
  Layer::forward(passType);
  reshape();
  {
    REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-    syncInputValue();
+    updateInputData();
    // just submit forward pipeline
    stream_->submit(pipelineFwd_);
@ -286,7 +278,6 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
    REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
    resetBwd();
    syncOutputGrad();
    // just submit backward pipeline
    stream_->submit(pipelineBwd_);
  }
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@ -53,6 +53,8 @@ public:
  void backward(const UpdateCallback& callback) override;
  void updateInputData() override;
 protected:
  /**
   * reshape the input image sizes
@ -72,8 +74,6 @@ protected:
   * only would be called when needed
   */
  void resetBwd();
  void convertOutputToOtherDevice() override;
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@ -114,10 +114,10 @@ public:
  virtual void convertWeightsToPaddle() {}
  /**
-   * convert MKLDNN output to other device.
+   * Update input value data when input layer is "data" type.
-   * only support CPU device yet
+   * Since the input value data address might be changed.
   */
-  virtual void convertOutputToOtherDevice() {}
+  virtual void updateInputData() {}
  /**
   * print info about sizes
@ -155,6 +155,7 @@ protected:
   *        copy base info and do not copy data value
   */
  void copyOutputInfoToOtherDevice() {
    int cnt = 0;
    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
      outputOtherDevice_[i].setFrameHeight(output_.getFrameHeight());
      outputOtherDevice_[i].setFrameWidth(output_.getFrameWidth());
@ -163,6 +164,12 @@ protected:
      outputOtherDevice_[i].subSequenceStartPositions =
          output_.subSequenceStartPositions;
      outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
      if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
        ++cnt;
      }
    }
    if (cnt > 1) {
      LOG(WARNING) << "should not have more than one CPU devie";
    }
  }
@ -193,32 +200,6 @@ protected:
    return outputOtherDevice_.size() == 0;
  }
  /**
   * Sync input value data
   */
  void syncInputValue() {
    if (!inputIsOnlyMKLDNN()) {
      // Re-point the MKLDNN input at the CPU input buffer; the buffer may
      // have changed when this layer directly follows a data layer.
      inVal_->updateData(getInputValue(0, CPU_DEVICE)->getData());
    }
  }
  /**
   * Sync output grad data
   */
  void syncOutputGrad() {
    if (!outputIsOnlyMKLDNN()) {
      // Re-point the MKLDNN output gradient at the CPU output's grad buffer.
      outGrad_->updateData(getOutput(CPU_DEVICE).grad->getData());
    }
  }
  /**
   * Set deviceId of this layer.
   */
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@ -2302,26 +2302,27 @@ void test3DDeConvLayer(const string& type, bool trans, bool useGpu) {
  conv->set_stride(2);
  conv->set_stride_y(2);
  conv->set_stride_z(2);
-  conv->set_img_size(IMAGE_SIZE);
+  conv->set_output_x(IMAGE_SIZE);
-  conv->set_img_size_y(IMAGE_SIZE_Y);
+  conv->set_output_y(IMAGE_SIZE_Y);
-  conv->set_img_size_z(IMAGE_SIZE_Z);
+  conv->set_output_z(IMAGE_SIZE_Z);
-  conv->set_output_x(imageSize(conv->img_size(),
+
  conv->set_img_size(imageSize(conv->output_x(),
                               conv->filter_size(),
                               conv->padding(),
                               conv->stride(),
                               true));
-  conv->set_output_y(imageSize(conv->img_size_y(),
+  conv->set_img_size_y(imageSize(conv->output_y(),
-                               conv->filter_size_y(),
+                                 conv->filter_size_y(),
-                               conv->padding_y(),
+                                 conv->padding_y(),
-                               conv->stride_y(),
+                                 conv->stride_y(),
-                               true));
+                                 true));
-  conv->set_output_z(imageSize(conv->img_size_z(),
+  conv->set_img_size_z(imageSize(conv->output_z(),
-                               conv->filter_size_z(),
+                                 conv->filter_size_z(),
-                               conv->padding_z(),
+                                 conv->padding_z(),
-                               conv->stride_z(),
+                                 conv->stride_z(),
-                               true));
+                                 true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+  config.layerConfig.set_size(conv->img_size() * conv->img_size_y() *
-                              conv->output_z() * NUM_FILTERS);
+                              conv->img_size_z() * NUM_FILTERS);
  conv->set_groups(1);
  conv->set_filter_channels(conv->channels() / conv->groups());
  config.inputDefs.push_back(
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@ -33,14 +33,12 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
    size_t width = cnts / dims[0];
    m = Matrix::create(height, width, false, false);
  }
  CHECK(m) << " Matrix should not be empty";
  CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
  CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
-
+  CHECK_EQ(cpuMatrix->getElementCnt(), cnts) << "Count size does not match";
-  CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";
+  return std::make_shared<MKLDNNMatrix>(cpuMatrix, pd);
  return std::make_shared<MKLDNNMatrix>(
      m->getData(), m->getHeight(), m->getWidth(), pd);
 }
 MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
@ -138,7 +136,7 @@ void MKLDNNMatrix::downSpatial() {
      mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
      "could not create a memory primitive");
  reset(result);
-  set_data_handle(getData());
+  set_data_handle(data_);
 }
 }  // namespace paddle
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@ -30,11 +30,10 @@ typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
 */
 class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
 public:
-  MKLDNNMatrix(real* data,
+  MKLDNNMatrix(CpuMatrixPtr m, mkldnn::memory::primitive_desc pd)
-               size_t height,
+      : CpuMatrix(m->getData(), m->getHeight(), m->getWidth(), false),
-               size_t width,
+        mkldnn::memory(pd, m->getData()),
-               mkldnn::memory::primitive_desc pd)
+        m_(m) {}
      : CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {}
  ~MKLDNNMatrix() {}
@ -81,11 +80,29 @@ public:
  void downSpatial();
  /**
-   * Update the memory data handle.
+   * set the memory data handle.
   * Caution: This will not check the buffer size of the data,
   *          it should be covered by user.
   */
-  void updateData(void* data) { set_data_handle(data); }
+  void setData(real* data) {
    set_data_handle(data);
    CpuMatrix::setData(data);
    m_.reset();
  }
  /**
   * override Matrix::getData
   * check data before return
   */
  real* getData() override {
    // The matrix buffer and the mkldnn memory handle must refer to the same
    // address; abort if they have drifted apart.
    void* const matrixBuffer = static_cast<void*>(data_);
    CHECK_EQ(matrixBuffer, get_data_handle());
    return data_;
  }
  const real* getData() const override {
    // Same consistency check as the non-const overload: the matrix buffer
    // and the mkldnn memory handle must be the same address.
    void* const matrixBuffer = static_cast<void*>(data_);
    CHECK_EQ(matrixBuffer, get_data_handle());
    return data_;
  }
  /**
   * Get primitive descriptor.
@ -143,6 +160,10 @@ protected:
                   memory::format srcFmt,
                   memory::format dstFmt,
                   memory::dims dm);
 private:
  // save the CpuMatrixPtr in case the buffer released outside
  CpuMatrixPtr m_;
 };
 }  // namespace paddle
--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@ -0,0 +1,79 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/operators/concat_op.h"
 #include <vector>
 namespace paddle {
 namespace operators {
 using framework::Tensor;
 /**
  * Concat operator.
  *
  * Joins the tensors of the duplicable input "X" along the dimension selected
  * by the integer attribute "axis" and stores the result in the output "Out".
  */
 class ConcatOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
 protected:
  /**
   * Compute the shape of "Out" from the shapes of the inputs.
   *
   * The output starts as a copy of the first input's dims; the extent along
   * "axis" is then accumulated over the remaining inputs. Enforces that:
   *   - more than one input tensor is given,
   *   - "axis" addresses an existing dimension of the first input,
   *   - every input has the same rank as the first input,
   *   - every input matches the first input on all dimensions except "axis".
   */
  void InferShape(const framework::InferShapeContext &ctx) const override {
    auto ins = ctx.MultiInput<framework::Tensor>("X");
    auto *out = ctx.Output<framework::Tensor>("Out");
    size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
    size_t n = ins.size();

    PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1.");

    auto out_dims = ins[0]->dims();
    size_t in_zero_dims_size = out_dims.size();
    // A negative "axis" attribute wraps to a huge size_t value, so this check
    // rejects both negative and too-large axes. Without it no dimension would
    // ever match `axis` and nothing would be concatenated.
    PADDLE_ENFORCE_GT(in_zero_dims_size, axis,
                      "The axis should be less than the rank of the input "
                      "tensors.");
    for (size_t i = 1; i < n; i++) {
      // Guard the per-dimension indexing below against a rank mismatch.
      PADDLE_ENFORCE_EQ(static_cast<size_t>(ins[i]->dims().size()),
                        in_zero_dims_size,
                        "Input tensors should have the same rank.");
      for (size_t j = 0; j < in_zero_dims_size; j++) {
        if (j == axis) {
          out_dims[axis] += ins[i]->dims()[j];
          continue;
        }
        PADDLE_ENFORCE_EQ(out_dims[j], ins[i]->dims()[j],
                          "Input tensors should have the same "
                          "elements except the specify axis.");
      }
    }
    out->Resize(out_dims);
  }
 };
 /**
  * Describes the concat operator: one duplicable input "X", one output
  * "Out", and an integer attribute "axis" that defaults to 0.
  */
 class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  ConcatOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // "X" is marked duplicable so that it can carry several tensors.
    AddInput("X", "the input tensors of concat operator.").AsDuplicable();
    AddOutput("Out", "the output tensor of concat operator.");
    // Operator-level comment text, including a worked example.
    AddComment(R"DOC(
            Join the input tensors along with the axis.
            Examples:
              Input[0] = [[1,2],[3,4]]
              Input[1] = [[5,6]]
              axis = 0
              Output = [[1,2],
                        [3,4],
                        [5,6]]
        )DOC");
    // The axis is optional for callers because of the default below.
    AddAttr<int>("axis", "The axis which the inputs will be joined with.")
        .SetDefault(0);
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(concat, ops::ConcatOp, ops::ConcatOpMaker)
 REGISTER_OP_CPU_KERNEL(concat,
                       ops::ConcatKernel<paddle::platform::CPUPlace, float>)
--- a/paddle/operators/concat_op.cu
+++ b/paddle/operators/concat_op.cu
@ -0,0 +1,19 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/operators/concat_op.h"
 namespace ops = paddle::operators;
 // TODO(Yancey1989) Add GPU kernel
--- a/paddle/operators/concat_op.h
+++ b/paddle/operators/concat_op.h
@ -0,0 +1,64 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <vector>
 #include "paddle/framework/op_registry.h"
 namespace paddle {
 namespace operators {
 /**
  * Kernel of the concat operator.
  *
  * Reads the tensors of input "X", and copies them into "Out" so that they are
  * joined along the dimension given by the "axis" attribute. The shapes are
  * viewed as [before, axis_dim, after], where `before` is the product of the
  * first input's dims in front of `axis` and `after` the product of those
  * behind it; each input contributes `before` contiguous chunks of
  * `axis_dim * after` elements.
  *
  * NOTE(review): the output buffer is always requested on
  * platform::CPUPlace(); the Place template parameter is not consulted.
  */
 template <typename Place, typename T>
 class ConcatKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto ins = ctx.MultiInput<framework::Tensor>("X");
    auto* out = ctx.Output<framework::Tensor>("Out");
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
    size_t n = ins.size();

    // Total extent of the output along the concat axis.
    size_t output_axis_dim = 0;
    for (size_t i = 0; i < n; i++) {
      output_axis_dim += ins[i]->dims()[axis];
    }

    // Products of the first input's dims in front of / behind the axis.
    size_t before = 1, after = 1;
    auto& input_zero = ins[0];
    for (int64_t i = 0; i < input_zero->dims().size(); i++) {
      if (i == axis) {
        continue;
      }
      if (i < axis) {
        before *= input_zero->dims()[i];
      } else {
        after *= input_zero->dims()[i];
      }
    }

    // With an empty outer extent there is nothing to copy; return before
    // touching the output buffer, as the copy loops would never have run.
    if (before == 0) {
      return;
    }

    // The buffer pointers and row sizes do not change inside the copy loops,
    // so fetch/compute them once instead of once per copied chunk.
    T* out_data = out->mutable_data<T>(platform::CPUPlace());
    const size_t out_row = output_axis_dim * after;

    size_t output_offset = 0;
    for (size_t i = 0; i < n; i++) {
      auto& in = ins[i];
      const size_t in_row = static_cast<size_t>(in->dims()[axis]) * after;
      const T* in_data = in->data<T>();
      for (size_t j = 0; j < before; j++) {
        // Chunk j of this input lands in output row j, behind the chunks of
        // the inputs that were already copied.
        memcpy(out_data + output_offset + out_row * j,
               in_data + in_row * j,
               in_row * sizeof(T));
      }
      output_offset += in_row;
    }
  }
 };
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@ -49,6 +49,7 @@ USE_OP(minus);
 USE_OP(cos_sim);
 USE_CPU_ONLY_OP(gather);
 USE_CPU_ONLY_OP(scatter);
 USE_CPU_ONLY_OP(concat);
 USE_OP(top_k);
 USE_OP(squared_l2_distance);
 USE_OP(sum);
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
@ -43,7 +43,6 @@ class OpDescCreationMethod(object):
        if len(args) != 0:
            raise ValueError("Only keyword arguments are supported.")
        op_desc = framework_pb2.OpDesc()
        for input_parameter in self.__op_proto__.inputs:
            input_arguments = kwargs.get(input_parameter.name, [])
            if is_str(input_arguments):
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@ -19,8 +19,6 @@ py_test(test_scatter_op SRCS test_scatter_op.py)
 py_test(test_fill_zeros_like_op SRCS test_fill_zeros_like_op.py)
 py_test(test_top_k_op SRCS test_top_k_op.py)
 py_test(gradient_checker SRCS gradient_checker.py)
 py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py)
 py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py)
@ -35,5 +33,6 @@ py_test(test_lookup_table SRCS test_lookup_table.py)
 py_test(test_scale_and_identity_op SRCS test_scale_and_identity_op.py)
 py_test(test_sum_op SRCS test_sum_op.py)
 py_test(mnist SRCS mnist.py)
 py_test(test_concat_op SRCS test_concat_op.py)
 py_test(test_squared_l2_distance_op SRCS test_squared_l2_distance_op.py)
 py_test(test_reshape_op SRCS test_reshape_op.py)
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
@ -9,7 +9,7 @@ def grad_var_name(var_name):
    return var_name + "@GRAD"
-def create_op(scope, op_type, inputs, outputs, attrs=None):
+def create_op(scope, op_type, inputs, outputs, attrs):
    kwargs = dict()
    for in_name, in_dup in Operator.get_op_inputs(op_type):
@ -17,7 +17,7 @@ def create_op(scope, op_type, inputs, outputs, attrs=None):
            kwargs[in_name] = []
            if in_dup:
                sub_in = inputs[in_name]
-                for sub_in_name in sub_in:
+                for sub_in_name, _ in sub_in:
                    var = scope.new_var(sub_in_name)
                    kwargs[in_name].append(sub_in_name)
            else:
@ -29,15 +29,16 @@ def create_op(scope, op_type, inputs, outputs, attrs=None):
            kwargs[out_name] = []
            if out_dup:
                sub_in = outputs[out_name]
-                for sun_in_name in sub_in:
+                for sub_in_name, _ in sub_in:
-                    var = scope.new_var(sun_in_name)
+                    var = scope.new_var(sub_in_name)
-                    kwargs[out_name].append(sun_in_name)
+                    kwargs[out_name].append(sub_in_name)
            else:
                var = scope.new_var(out_name)
                kwargs[out_name].append(out_name)
    for attr_name in Operator.get_op_attr_names(op_type):
-        kwargs[attr_name] = attrs[attr_name]
+        if attr_name in attrs:
            kwargs[attr_name] = attrs[attr_name]
    return Operator(op_type, **kwargs)
@ -46,12 +47,11 @@ def set_input(scope, op, inputs, place):
        if in_name in inputs:
            if in_dup:
                sub_in = inputs[in_name]
-                for sub_in_name in sub_in:
+                for sub_in_name, sub_in_array in sub_in:
                    var = scope.find_var(sub_in_name)
                    tensor = var.get_tensor()
-                    arr = sub_in[sub_in_name]
+                    tensor.set_dims(sub_in_array.shape)
-                    tensor.set_dims(arr.shape)
+                    tensor.set(sub_in_array, place)
                    tensor.set(arr, place)
            else:
                var = scope.find_var(in_name)
                tensor = var.get_tensor()
@ -65,7 +65,7 @@ def set_output_grad(scope, op, outputs, place):
        if out_name in outputs:
            if out_dup:
                sub_out = outputs[out_name]
-                for sub_out_name in sub_out:
+                for sub_out_name, _ in sub_out:
                    out_tensor = scope.find_var(sub_out_name).get_tensor()
                    grad_tensor = scope.new_var(grad_var_name(
                        sub_out_name)).get_tensor()
@ -110,7 +110,7 @@ def get_numeric_gradient(scope,
    # we use a for loop to compute the gradient of every element.
    for i in xrange(tensor_size):
        if in_place:
-            set_input(op, inputs, core.CPUPlace())
+            set_input(scope, op, inputs, core.CPUPlace())
        # get one input element throw it's index i.
        origin = tensor_to_check.get_float_element(i)
@ -120,7 +120,7 @@ def get_numeric_gradient(scope,
        y_pos = get_output()
        if in_place:
-            set_input(op, inputs, core.CPUPlace())
+            set_input(scope, op, inputs, core.CPUPlace())
        x_neg = origin - delta
        tensor_to_check.set_float_element(i, x_neg)
@ -168,7 +168,10 @@ def get_gradient(scope, op, inputs, outputs, grad_name, place,
 class OpTest(unittest.TestCase):
    def check_output_with_place(self, place):
        self.scope = core.Scope()
-        self.op = create_op(self.scope, self.op_type, self.inputs, self.outputs)
+        op_inputs = self.inputs if hasattr(self, "inputs") else dict()
        op_attrs = self.attrs if hasattr(self, "attrs") else dict()
        self.op = create_op(self.scope, self.op_type, op_inputs, self.outputs,
                            op_attrs)
        if isinstance(place, core.GPUPlace) and not self.op.support_gpu():
            return
        set_input(self.scope, self.op, self.inputs, place)
@ -227,7 +230,10 @@ class OpTest(unittest.TestCase):
                   in_place=False,
                   max_relative_error=0.005):
        self.scope = core.Scope()
-        self.op = create_op(self.scope, self.op_type, self.inputs, self.outputs)
+        op_inputs = self.inputs if hasattr(self, "inputs") else dict()
        op_attrs = self.attrs if hasattr(self, "attrs") else dict()
        self.op = create_op(self.scope, self.op_type, op_inputs, self.outputs,
                            op_attrs)
        if no_grad_set is None:
            no_grad_set = set()
--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@ -1,72 +0,0 @@
 import numpy
 import paddle.v2.framework.core as core
 from paddle.v2.framework.op import Operator
 class OpTestMeta(type):
    """
    Metaclass for operator tests.

    It injects a `test_all` method into the user's test class so that the
    Python unittest module picks it up and runs it.

    `test_all` reads the values stored on `self` (`type`, `inputs`,
    `outputs`, `attrs`), uses them to create and run an operator, and checks
    every produced output against the expected value in `self.outputs`.

    See `test_add_two_op` for example usage.
    """
    def __new__(cls, name, bases, attrs):
        obj = super(OpTestMeta, cls).__new__(cls, name, bases, attrs)
        def test_all(self):
            scope = core.Scope()
            kwargs = dict()
            # Always test on CPU; add GPU 0 when core reports GPU support.
            places = [core.CPUPlace()]
            if core.is_compile_gpu():
                places.append(core.GPUPlace(0))
            for place in places:
                # Create one variable per provided input and fill it with the
                # test data; inputs the test does not provide are passed to
                # the operator as "@EMPTY@".
                for in_name in Operator.get_op_input_names(self.type):
                    if hasattr(self, "inputs") and in_name in self.inputs:
                        kwargs[in_name] = in_name
                        var = scope.new_var(in_name).get_tensor()
                        arr = self.inputs[in_name]
                        var.set_dims(arr.shape)
                        var.set(arr, place)
                    else:
                        kwargs[in_name] = "@EMPTY@"
                # Every output of the operator must have an expected value.
                for out_name in Operator.get_op_output_names(self.type):
                    if not hasattr(self, "outputs"):
                        raise ValueError(
                            "The test op must set self.outputs dict.")
                    if out_name not in self.outputs:
                        raise ValueError("The %s is not in self.outputs dict." %
                                         (out_name))
                    kwargs[out_name] = out_name
                    scope.new_var(out_name).get_tensor()
                # Attributes are optional: only those set on the test are
                # forwarded to the operator.
                for attr_name in Operator.get_op_attr_names(self.type):
                    if hasattr(self, "attrs") and attr_name in self.attrs:
                        kwargs[attr_name] = self.attrs[attr_name]
                op = Operator(self.type, **kwargs)
                # Stop here when the operator cannot run on a GPU place.
                if isinstance(place, core.GPUPlace) and not op.support_gpu():
                    return
                op.infer_shape(scope)
                ctx = core.DeviceContext.create(place)
                op.run(scope, ctx)
                # Compare each output tensor with its expected array.
                for out_name in Operator.get_op_output_names(self.type):
                    actual = numpy.array(scope.find_var(out_name).get_tensor())
                    expect = self.outputs[out_name]
                    self.assertTrue(
                        numpy.allclose(
                            actual, expect, atol=1e-05),
                        "output name: " + out_name + " has diff")
        obj.test_all = test_all
        return obj
--- a/python/paddle/v2/framework/tests/test_add_two_op.py
+++ b/python/paddle/v2/framework/tests/test_add_two_op.py
@ -1,23 +1,20 @@
 import unittest
 import numpy as np
 from op_test import OpTest
 import numpy
 import paddle.v2.framework.core as core
 from paddle.v2.framework.op import Operator
 from op_test_util import OpTestMeta
 class TestAddOp(unittest.TestCase):
    __metaclass__ = OpTestMeta
 class TestAddOp(OpTest):
    def setUp(self):
-        self.type = "add"
+        self.op_type = "add"
        self.inputs = {
-            'X': numpy.random.random((102, 105)).astype("float32"),
+            'X': np.random.random((102, 105)).astype("float32"),
-            'Y': numpy.random.random((102, 105)).astype("float32")
+            'Y': np.random.random((102, 105)).astype("float32")
        }
        self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
    def test_check_output(self):
        self.check_output()
-if __name__ == '__main__':
+if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/v2/framework/tests/test_concat_op.py
+++ b/python/paddle/v2/framework/tests/test_concat_op.py
@ -0,0 +1,22 @@
 import unittest
 import numpy as np
 from op_test import OpTest
 class TestConcatOp(OpTest):
    """Output check for the concat operator on three 4-D float32 inputs."""
    def setUp(self):
        self.op_type = "concat"
        axis = 2
        # The inputs differ only along the concat axis (extent 2, 3 and 4).
        shapes = [(2, 3, 2, 5), (2, 3, 3, 5), (2, 3, 4, 5)]
        arrays = [np.random.random(shape).astype('float32') for shape in shapes]
        # A duplicable input is given as a list of (variable name, array).
        self.inputs = {
            'X': [('x%d' % idx, arr) for idx, arr in enumerate(arrays)]
        }
        self.attrs = {'axis': axis}
        self.outputs = {'Out': np.concatenate(arrays, axis=axis)}
    def test_check_output(self):
        self.check_output()
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/framework/tests/test_cos_sim_op.py
+++ b/python/paddle/v2/framework/tests/test_cos_sim_op.py
@ -1,17 +1,14 @@
 import unittest
 import numpy as np
-from gradient_checker import GradientChecker, create_op
+from op_test import OpTest
 from op_test_util import OpTestMeta
-class TestCosSimOp(unittest.TestCase):
+class TestCosSimOp(OpTest):
    __metaclass__ = OpTestMeta
    def setUp(self):
-        self.type = "cos_sim"
+        self.op_type = "cos_sim"
        self.inputs = {
-            'X': np.random.random((32, 64)).astype("float32"),
+            'X': np.random.random((10, 5)).astype("float32"),
-            'Y': np.random.random((32, 64)).astype("float32")
+            'Y': np.random.random((10, 5)).astype("float32")
        }
        expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1)
        expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1)
@ -23,38 +20,20 @@ class TestCosSimOp(unittest.TestCase):
            'Out': np.expand_dims(expect_out, 1)
        }
    def test_check_output(self):
        self.check_output()
-class TestCosSimGradOp(GradientChecker):
+    def test_check_grad_normal(self):
-    def setUp(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
        self.op = create_op("cos_sim")
        self.inputs = {
            'X': np.random.random((10, 5)).astype("float32"),
            'Y': np.random.random((10, 5)).astype("float32")
        }
    def test_cpu_gpu_compare(self):
        self.compare_grad(self.op, self.inputs)
    def test_normal(self):
        self.check_grad(
            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.05)
-    def test_ignore_x(self):
+    def test_check_grad_ingore_x(self):
        self.check_grad(
-            self.op,
+            ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set('X'))
            self.inputs, ["Y"],
            "Out",
            max_relative_error=0.05,
            no_grad_set={"X"})
-    def test_ignore_y(self):
+    def test_check_grad_ignore_y(self):
        self.check_grad(
-            self.op,
+            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
            self.inputs, ["X"],
            "Out",
            max_relative_error=0.05,
            no_grad_set={"Y"})
-if __name__ == '__main__':
+if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@ -21,7 +21,7 @@ class TestCrossEntropy(OpTest):
        self.check_output()
    def test_check_grad(self):
-        self.check_grad(["X"], "Y")
+        self.check_grad(['X'], 'Y')
 if __name__ == "__main__":
--- a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
+++ b/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
@ -1,16 +1,17 @@
 import unittest
-from op_test_util import OpTestMeta
+import numpy as np
-import numpy
+from op_test import OpTest
-class TestFillZerosLikeOp(unittest.TestCase):
+class TestFillZerosLikeOp(OpTest):
    __metaclass__ = OpTestMeta
    def setUp(self):
-        self.type = "fill_zeros_like"
+        self.op_type = "fill_zeros_like"
-        self.inputs = {'Src': numpy.random.random((219, 232)).astype("float32")}
+        self.inputs = {'Src': np.random.random((219, 232)).astype("float32")}
-        self.outputs = {'Dst': numpy.zeros_like(self.inputs['Src'])}
+        self.outputs = {'Dst': np.zeros_like(self.inputs["Src"])}
    def test_check_output(self):
        self.check_output()
-if __name__ == '__main__':
+if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/v2/framework/tests/test_gather_op.py
+++ b/python/paddle/v2/framework/tests/test_gather_op.py
@ -1,30 +1,20 @@
 import unittest
-from op_test_util import OpTestMeta
+import numpy as np
-from gradient_checker import GradientChecker, create_op
+from op_test import OpTest
 import numpy
 import paddle.v2.framework.core as core
 from paddle.v2.framework.op import Operator
-class TestGatherOp(unittest.TestCase):
+class TestGatherOp(OpTest):
    __metaclass__ = OpTestMeta
    def setUp(self):
-        self.type = "gather"
+        self.op_type = "gather"
-        xnp = numpy.random.random((10, 20)).astype("float32")
+        xnp = np.random.random((10, 20)).astype("float32")
-        self.inputs = {
+        self.inputs = {'X': xnp, 'Index': np.array([1, 3, 5]).astype("int32")}
-            'X': xnp,
+        self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]}
            'Index': numpy.array([1, 3, 5]).astype("int32")
        }
        self.outputs = {'Out': self.inputs['X'][self.inputs['Index']]}
    def test_check_output(self):
        self.check_output()
-class TestGatherGradOp(GradientChecker):
+    def test_check_grad(self):
-    def test_gather_grad(self):
+        self.check_grad(['X'], 'Out')
        op = create_op("gather")
        xnp = numpy.random.random((10, 20)).astype("float32")
        inputs = {'X': xnp, 'Index': numpy.array([1, 3, 5]).astype("int32")}
        self.check_grad(op, inputs, set("X"), "Out")
 if __name__ == "__main__":
--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
@ -14,11 +14,11 @@ class GaussianRandomTest(unittest.TestCase):
    def gaussian_random_test(self, place):
        scope = core.Scope()
-        scope.new_var("Out").get_tensor()
+        scope.new_var('Out').get_tensor()
        op = Operator(
            "gaussian_random",
-            Out="Out",
+            Out='Out',
            dims=[1000, 784],
            mean=.0,
            std=1.,
@ -27,10 +27,10 @@ class GaussianRandomTest(unittest.TestCase):
        op.infer_shape(scope)
        context = core.DeviceContext.create(place)
        op.run(scope, context)
-        tensor = numpy.array(scope.find_var("Out").get_tensor())
+        tensor = numpy.array(scope.find_var('Out').get_tensor())
        self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
        self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1)
-if __name__ == '__main__':
+if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/v2/framework/tests/test_gradient_checker.py
+++ b/python/paddle/v2/framework/tests/test_gradient_checker.py
@ -1,42 +1,44 @@
 import unittest
-import numpy
+import numpy as np
-from paddle.v2.framework.op import Operator
+import paddle.v2.framework.core as core
-from gradient_checker import GradientChecker
+from op_test import get_numeric_gradient
-from gradient_checker import get_numeric_gradient
+from op_test import create_op
 class GetNumericGradientTest(unittest.TestCase):
    def test_add_op(self):
-        add_op = Operator("add", X="X", Y="Y", Out="Z")
+        x = np.random.random((10, 1)).astype("float32")
-        x = numpy.random.random((10, 1)).astype("float32")
+        y = np.random.random((10, 1)).astype("float32")
-        y = numpy.random.random((10, 1)).astype("float32")
+        z = x + y
-
+        scope = core.Scope()
-        arr = get_numeric_gradient(add_op, {"X": x, "Y": y}, "Z", "X")
+        add_op = create_op(scope, "add", {'X': x, 'Y': y}, {'Out': z}, dict())
        arr = get_numeric_gradient(scope, add_op, {'X': x, 'Y': y}, 'X', 'Out')
        self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-4)
    def test_softmax_op(self):
        def stable_softmax(x):
            """Compute the softmax of vector x in a numerically stable way."""
-            shiftx = x - numpy.max(x)
+            shiftx = x - np.max(x)
-            exps = numpy.exp(shiftx)
+            exps = np.exp(shiftx)
-            return exps / numpy.sum(exps)
+            return exps / np.sum(exps)
        def label_softmax_grad(Y, dY):
            dX = Y * 0.0
            for i in range(Y.shape[0]):
-                d = numpy.dot(Y[i, :], dY[i, :])
+                d = np.dot(Y[i, :], dY[i, :])
                dX[i, :] = Y[i, :] * (dY[i, :] - d)
            return dX
-        softmax_op = Operator("softmax", X="X", Y="Y")
+        X = np.random.random((2, 2)).astype("float32")
-
+        Y = np.apply_along_axis(stable_softmax, 1, X)
-        X = numpy.random.random((2, 2)).astype("float32")
+        dY = np.ones(Y.shape)
        Y = numpy.apply_along_axis(stable_softmax, 1, X)
        dY = numpy.ones(Y.shape)
        dX = label_softmax_grad(Y, dY)
-        arr = get_numeric_gradient(softmax_op, {"X": X}, "Y", "X")
+        scope = core.Scope()
-        numpy.testing.assert_almost_equal(arr, dX, decimal=1e-2)
+        softmax_op = create_op(scope, "softmax", {"X": X}, {"Y": Y}, dict())
        arr = get_numeric_gradient(scope, softmax_op, {"X": X}, "X", "Y")
        np.testing.assert_almost_equal(arr, dX, decimal=1e-2)
 if __name__ == "__main__":
--- a/python/paddle/v2/framework/tests/test_lookup_table.py
+++ b/python/paddle/v2/framework/tests/test_lookup_table.py
@ -1,31 +1,22 @@
 import unittest
 import numpy as np
-from op_test_util import OpTestMeta
+from op_test import OpTest
 from gradient_checker import GradientChecker, create_op
-class TestLookupTableOp(unittest.TestCase):
+class TestLookupTableOp(OpTest):
    __metaclass__ = OpTestMeta
    def setUp(self):
-        self.type = 'lookup_table'
+        self.op_type = "lookup_table"
-        table = np.random.random((17, 31)).astype('float32')
+        table = np.random.random((17, 31)).astype("float32")
-        ids = np.random.randint(0, 17, 4).astype('int32')
+        ids = np.random.randint(0, 17, 4).astype("int32")
        self.inputs = {'W': table, 'Ids': ids}
        self.outputs = {'Out': table[ids]}
    def test_check_output(self):
        self.check_output()
-class TestLookupTableGradOp(GradientChecker):
+    def test_check_grad(self):
-    def test_grad(self):
+        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
        op = create_op('lookup_table')
        table = np.random.random((17, 31)).astype('float32')
        ids = np.random.randint(0, 17, 4).astype('int32')
        inputs = {'W': table, 'Ids': ids}
        # comapre gradients 
        self.compare_grad(op, inputs, set(['Ids']))
        # check gradients 
        self.check_grad(op, inputs, set('W'), 'Out')
-if __name__ == '__main__':
+if __name__ == "__main__":
    unittest.main()
--- a/Show More
+++ b/Show More