Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev_add_FillZerosLikeOp_test

8 years ago · df84e39a00
parent 589c3d864b 0bd49a50ce
commit df84e39a00
78 changed files with 1508 additions and 633 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -17,10 +17,14 @@
    -   id: detect-private-key
        files: (?!.*third_party)^.*$ | (?!.*book)^.*$
    -   id: end-of-file-fixer
-   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
-    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
+-   repo: local
    hooks:
-    -   id: clang-formater
+    -   id: clang-format
+        name: clang-format
+        description: Format files with ClangFormat.
+        entry: clang-format -i
+        language: system
+        files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
    sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
    hooks:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -36,8 +36,8 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
+option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
+option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
--- a/2
+++ b/2
@ -27,7 +27,7 @@ RUN apt-get update && \
    git python-pip python-dev openssh-server bison  \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
-    python-numpy python-matplotlib gcc g++ \
+    python-numpy python-matplotlib gcc-4.8 g++-4.8 \
    automake locales clang-format-3.8 swig doxygen cmake  \
    liblapack-dev liblapacke-dev libboost-dev \
    clang-3.8 llvm-3.8 libclang-3.8-dev \
--- a/README.md
+++ b/README.md
@ -72,7 +72,7 @@ We provide [English](http://doc.paddlepaddle.org/develop/doc/) and

 - [Deep Learning 101](http://book.paddlepaddle.org/index.html)

-  You might want to start from the this online interactive book that can run in Jupyter Notebook.
+  You might want to start from this online interactive book that can run in Jupyter Notebook.

 - [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)

--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@ -20,34 +20,30 @@ INCLUDE(ExternalProject)

 SET(MKLDNN_PROJECT        "extern_mkldnn")
 SET(MKLDNN_SOURCES_DIR    ${THIRD_PARTY_PATH}/mkldnn)
-SET(MKLDNN_INSTALL_ROOT   ${CMAKE_INSTALL_PREFIX})
-IF(NOT "$ENV{HOME}" STREQUAL "/root")
-    SET(MKLDNN_INSTALL_ROOT  "$ENV{HOME}")
-ENDIF()
-
-SET(MKLDNN_INSTALL_DIR    "${MKLDNN_INSTALL_ROOT}/opt/paddle/third_party/mkldnn")
-SET(MKLDNN_INCLUDE_DIR    "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
+SET(MKLDNN_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/mkldnn)
+SET(MKLDNN_INC_DIR        "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)

-IF(WIN32)
-    MESSAGE(WARNING "It is not supported compiling with mkldnn in windows Paddle yet."
-      "Force WITH_MKLDNN=OFF")
-    SET(WITH_MKLDNN OFF)
+IF(WIN32 OR APPLE)
+    MESSAGE(WARNING 
+        "Windows or Mac is not supported with MKLDNN in Paddle yet."
+        "Force WITH_MKLDNN=OFF")
+    SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE)
    return()
-ELSE(WIN32)
-    SET(MKLDNN_LIBRARY "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
-    MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
-    SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
-    #SET(CMAKE_MACOSX_RPATH 1) # hold for MacOS
-    SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
-ENDIF(WIN32)
+ENDIF()
+
+SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
+MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
+SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")

-INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})

 IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
    SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
    SET(MKLDNN_MKLROOT   ${MKLML_ROOT})
    SET(MKLDNN_IOMP_LIB  ${MKLML_IOMP_LIB})
    SET(MKLDNN_IOMP_DIR  ${MKLML_LIB_DIR})
+    MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
 ENDIF()

 ExternalProject_Add(
@ -57,16 +53,15 @@ ExternalProject_Add(
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
    GIT_TAG             "v0.9"
    PREFIX              ${MKLDNN_SOURCES_DIR}
-    CONFIGURE_COMMAND   mkdir -p <SOURCE_DIR>/build
-    BUILD_COMMAND       cd <SOURCE_DIR>/build
-                        && cmake .. -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} -DMKLROOT=${MKLDNN_MKLROOT}
-                        && $(MAKE)
-    INSTALL_COMMAND     cd <SOURCE_DIR>/build && $(MAKE) install
    UPDATE_COMMAND      ""
+    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
+    CMAKE_ARGS          -DMKLROOT=${MKLDNN_MKLROOT}
+    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
+                        -DMKLROOT:PATH=${MKLDNN_MKLROOT}
 )

 ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIBRARY})
+SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
 ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
-MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIBRARY}")
+MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}")
 LIST(APPEND external_project_dependencies mkldnn)
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@ -16,19 +16,23 @@ IF(NOT ${WITH_MKLML})
  return()
 ENDIF(NOT ${WITH_MKLML})

+IF(WIN32 OR APPLE)
+    MESSAGE(WARNING 
+        "Windows or Mac is not supported with MKLML in Paddle yet."
+        "Force WITH_MKLML=OFF")
+    SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE)
+    return()
+ENDIF()
+
 INCLUDE(ExternalProject)

 SET(MKLML_PROJECT       "extern_mklml")
-SET(MKLML_VER           "mklml_lnx_2018.0.20170425")
+SET(MKLML_VER           "mklml_lnx_2018.0.20170720")
 SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
-SET(MKLML_DST_DIR       "opt/paddle/third_party/mklml")
-SET(MKLML_INSTALL_ROOT  "${CMAKE_INSTALL_PREFIX}")
-IF(NOT "$ENV{HOME}" STREQUAL "/root")
-    SET(MKLML_INSTALL_ROOT  "$ENV{HOME}")
-ENDIF()
-
+SET(MKLML_DST_DIR       "mklml")
+SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
 SET(MKLML_INSTALL_DIR   ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
 SET(MKLML_ROOT          ${MKLML_INSTALL_DIR}/${MKLML_VER})
 SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@ -9,6 +9,11 @@ function(CheckCompilerCXX11Flag)
        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
            message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
        endif()
+        # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
+        # Use Debug mode instead for now.
+        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) 
+            set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
+        endif()
    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
        # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
        # Apple Clang is a different compiler than upstream Clang which has different version numbers.
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@ -104,6 +104,11 @@ cross_channel_norm
 ------------------
 ..  autoclass:: paddle.v2.layer.cross_channel_norm
    :noindex:
+
+row_l2_norm
+-----------
+..  autoclass:: paddle.v2.layer.row_l2_norm
+    :noindex:
    
 Recurrent Layers
 ================
@ -320,6 +325,11 @@ scaling
 ..  autoclass:: paddle.v2.layer.scaling
    :noindex:

+clip
+----
+..  autoclass:: paddle.v2.layer.clip
+    :noindex:
+
 slope_intercept
 ---------------
 ..  autoclass:: paddle.v2.layer.slope_intercept
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@ -15,7 +15,6 @@ if(Boost_FOUND)
  add_subdirectory(platform)
  add_subdirectory(framework)
  add_subdirectory(operators)
-  add_subdirectory(pybind)
 endif()

 if(WITH_C_API)
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@ -1022,6 +1022,15 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
  real alpha = 1.0f;
  real beta = 1.0f;
  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
+
+  int batch_size = ((cudnn_tensor_descriptor)inputDesc)->batch_size;
+  if (batch_size > 1024 && g_cudnn_lib_version < 6000) {
+    LOG(INFO) << " To process current batch data with size " << batch_size
+              << " (>1024), cudnnBatchNorm requires cuDNN version >= 6000."
+              << " If there is an error complaining CUDNN_STATUS_NOT_SUPPORTED,"
+              << " just recompile PaddlePaddle with cuDNN >= 6000, replacing"
+              << " current version " << g_cudnn_lib_version;
+  }
  CHECK_CUDNN(
      dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
                                                       mode,
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -31,8 +31,14 @@ py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)

-cc_library(net SRCS net.cc DEPS op_registry)
-cc_test(net_op_test SRCS net_op_test.cc DEPS net)
-
-cc_library(backward SRCS backward.cc DEPS net)
+cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward)
+cc_library(paddle_pybind SHARED
+    SRCS pybind.cc
+    DEPS pybind python backward
+	fc_op
+	sgd_op
+	add_op
+	mean_op
+	cross_entropy_op
+	recurrent_op)
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@ -14,8 +14,8 @@

 #include "paddle/framework/backward.h"
 #include <list>
-#include "paddle/framework/net.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/net_op.h"

 namespace paddle {
 namespace framework {
@ -32,7 +32,7 @@ static bool AllInSet(const std::vector<std::string>& names,
 }

 static std::shared_ptr<OperatorBase> NOP() {
-  auto net_op = std::make_shared<NetOp>();
+  auto net_op = std::make_shared<operators::NetOp>();
  net_op->type_ = "@NOP@";
  net_op->CompleteAddOp();
  return net_op;
@ -77,11 +77,11 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
  }

  // Returned gradient network
-  auto net = std::make_shared<NetOp>();
+  auto net = std::make_shared<operators::NetOp>();

  if (forwardOp.IsNetOp()) {
    // Because forwardOp is a net op, it can be static_cast.
-    auto& forwardNet = static_cast<const NetOp&>(forwardOp);
+    auto& forwardNet = static_cast<const operators::NetOp&>(forwardOp);

    // Map from output gradient variable name to operator's indices in backward
    // net. That operator generates that variable.
@ -168,6 +168,9 @@ std::shared_ptr<OperatorBase> Backward(
  std::unordered_set<std::string> no_grad_names;
  no_grad_names.reserve(no_grad_vars.size());

+  no_grad_names.insert(OperatorBase::EMPTY_VAR_NAME() +
+                       OperatorBase::GRAD_VAR_SUFFIX());
+
  for (auto& name : no_grad_vars) {
    no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX());
  }
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@ -15,8 +15,9 @@
 #include "paddle/framework/backward.h"

 #include <gtest/gtest.h>
-#include "paddle/framework/net.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/net_op.h"
+#include "paddle/operators/type_alias.h"

 namespace paddle {
 namespace framework {
@ -70,7 +71,7 @@ class NoGradOpMaker : public OpProtoAndCheckerMaker {
  }
 };

-class FcOp : public NetOp {
+class FcOp : public ops::NetOp {
 public:
  void Init() override {
    AddOp(OpRegistry::CreateOp("mul", {Input("X"), Input("W")},
@ -182,7 +183,8 @@ TEST(Backward, simple_op_not_need_grad) {
  auto no_input_gop = f::Backward(*fwd, {"X", "b"});
  ASSERT_NE(no_input_gop, nullptr);
  ASSERT_TRUE(no_input_gop->IsNetOp());
-  ASSERT_EQ(0UL, std::static_pointer_cast<f::NetOp>(no_input_gop)->ops_.size());
+  ASSERT_EQ(0UL,
+            std::static_pointer_cast<ops::NetOp>(no_input_gop)->ops_.size());
 }

 TEST(Backward, net_fc_backward_normal) {
@ -191,7 +193,7 @@ TEST(Backward, net_fc_backward_normal) {
  ASSERT_NE(fwd, nullptr);
  std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
  ASSERT_TRUE(gop->IsNetOp());
-  auto net = static_cast<f::NetOp *>(gop.get());
+  auto net = static_cast<ops::NetOp *>(gop.get());

  ASSERT_NO_THROW(net->DebugString());

@ -214,7 +216,7 @@ TEST(Backward, net_fc_backward_not_have_b) {
  ASSERT_NE(fwd, nullptr);
  std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
  ASSERT_TRUE(gop->IsNetOp());
-  auto net = static_cast<f::NetOp *>(gop.get());
+  auto net = static_cast<ops::NetOp *>(gop.get());

  ASSERT_NO_THROW(net->DebugString());

@ -228,7 +230,7 @@ TEST(Backward, net_fc_backward_not_have_b) {
 }

 TEST(Backward, net_input_of_network_not_need_grad) {
-  f::NetOp net;
+  ops::NetOp net;
  net.AddOp(f::OpRegistry::CreateOp("fc", {"X", "W1", "b1"},
                                    {"mul_tmp_0", "add_tmp_0", "hidden0"}, {}));
  net.AddOp(f::OpRegistry::CreateOp("fc", {"hidden0", "W2", "b2"},
@ -236,7 +238,7 @@ TEST(Backward, net_input_of_network_not_need_grad) {
  net.CompleteAddOp();
  auto bwd = Backward(net, {"X"});  // X@GRAD is not need.
  ASSERT_TRUE(bwd->IsNetOp());
-  auto bwd_net = static_cast<f::NetOp *>(bwd.get());
+  auto bwd_net = static_cast<ops::NetOp *>(bwd.get());

  std::unordered_set<std::string> all_output = std::unordered_set<std::string>(
      bwd_net->outputs_.begin(), bwd_net->outputs_.end());
@ -253,7 +255,7 @@ TEST(Backward, net_input_of_network_not_need_grad) {

  ASSERT_EQ(2UL, bwd_net->ops_.size());
  ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
-  auto first_fc_grad = static_cast<f::NetOp *>(bwd_net->ops_[1].get());
+  auto first_fc_grad = static_cast<ops::NetOp *>(bwd_net->ops_[1].get());
  ASSERT_EQ(3UL, first_fc_grad->ops_.size());
  ASSERT_EQ(
      f::OperatorBase::EMPTY_VAR_NAME(),
@ -261,14 +263,14 @@ TEST(Backward, net_input_of_network_not_need_grad) {
 }

 TEST(Backward, net_shared_weight) {
-  f::NetOp net;
+  ops::NetOp net;
  net.AddOp(f::OpRegistry::CreateOp("mul", {"X", "W"}, {"Out"}, {}));
  net.AddOp(f::OpRegistry::CreateOp("mul", {"Out", "W"}, {"FinalOut"}, {}));
  net.CompleteAddOp();

  auto bwd = f::Backward(net, {});
  ASSERT_TRUE(bwd->IsNetOp());
-  auto bwd_net = static_cast<f::NetOp *>(bwd.get());
+  auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
  ASSERT_EQ(3UL, bwd_net->ops_.size());
  ASSERT_EQ("add", bwd_net->ops_[2]->type_);
 }
@ -285,7 +287,7 @@ TEST(Backward, op_all_input_are_not_need) {
  auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
  auto backward = f::Backward(*fwd, {"X", "b"});
  ASSERT_TRUE(backward->IsNetOp());
-  auto net = static_cast<f::NetOp *>(backward.get());
+  auto net = static_cast<ops::NetOp *>(backward.get());
  ASSERT_TRUE(net->ops_.empty());
 }

@ -293,7 +295,7 @@ TEST(Backward, op_all_output_are_not_need) {
  auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
  auto backward = f::Backward(*fwd, {"Out"});
  ASSERT_TRUE(backward->IsNetOp());
-  auto net = static_cast<f::NetOp *>(backward.get());
+  auto net = static_cast<ops::NetOp *>(backward.get());
  ASSERT_TRUE(net->ops_.empty());
 }

@ -301,7 +303,7 @@ TEST(Backward, op_part_of_output_are_not_need) {
  auto fwd = f::OpRegistry::CreateOp("many_output_op", {"X"}, {"Y", "Z"}, {});
  auto backward = f::Backward(*fwd, {"Z"});
  ASSERT_TRUE(backward->IsNetOp());
-  auto net = static_cast<f::NetOp *>(backward.get());
+  auto net = static_cast<ops::NetOp *>(backward.get());
  ASSERT_EQ(net->ops_.size(), 2UL);

  auto &fill_zero = *net->ops_[0];
@ -341,7 +343,7 @@ TEST(Backward, op_part_of_input_are_not_need) {
 }

 TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
-  f::NetOp net;
+  ops::NetOp net;
  net.AddOp(f::OpRegistry::CreateOp("fc", {"x1", "w1", "b1"},
                                    {"mul_out1", "add_out1", "out1"}, {}));
  net.AddOp(f::OpRegistry::CreateOp("fc", {"out1", "w2", "b2"},
@ -351,7 +353,7 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
  net.CompleteAddOp();
  auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"});
  ASSERT_TRUE(backward->IsNetOp());
-  auto bwd_net = static_cast<f::NetOp *>(backward.get());
+  auto bwd_net = static_cast<ops::NetOp *>(backward.get());
  ASSERT_EQ(bwd_net->ops_.size(), 3UL);
  auto &grad_fc = *bwd_net->ops_[0];
  EXPECT_EQ(grad_fc.inputs_.size(),
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@ -400,6 +400,14 @@ class GradOpRegisterHelper {
    return 0;                                                                  \
  }

+/**
+ * Macro to forbid users from registering a gradient operator.
+ */
+#define NO_GRADIENT(__op_type)                          \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                       \
+      __reg_gradient_op__##__op_type##__op_type##_grad, \
+      "NO_GRADIENT must be in global namespace")
+
 /**
 * Macro to Register OperatorKernel.
 */
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@ -20,16 +20,16 @@ namespace paddle {
 namespace framework {

 template <>
-Eigen::DefaultDevice* ExecutionContext::GetEigenDevice<
+Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
    platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return device_context_.get_eigen_device<Eigen::DefaultDevice>();
+  return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
 }

 #ifndef PADDLE_ONLY_CPU
 template <>
-Eigen::GpuDevice*
+Eigen::GpuDevice&
 ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return device_context_.get_eigen_device<Eigen::GpuDevice>();
+  return *device_context_.get_eigen_device<Eigen::GpuDevice>();
 }
 #endif

@ -52,7 +52,8 @@ std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
  PADDLE_ENFORCE(in_out_idxs_ != nullptr, "IO Idx could not be nullptr");
  auto input_format = GetAttr<std::vector<int>>("input_format");
  auto offset = in_out_idxs_->at(name);
-  PADDLE_ENFORCE(input_format.at((size_t)offset + 1) <= (int)inputs_.size(),
+  PADDLE_ENFORCE(input_format.at(static_cast<size_t>(offset) + 1) <=
+                     static_cast<int>(inputs_.size()),
                 "Input Out Of Range");

  return std::vector<std::string>{
@ -78,7 +79,8 @@ std::vector<std::string> OperatorBase::Outputs(const std::string& name) const {
  PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr");
  auto output_format = GetAttr<std::vector<int>>("output_format");
  auto offset = in_out_idxs_->at(name);
-  PADDLE_ENFORCE(output_format.at((size_t)offset + 1) <= (int)outputs_.size(),
+  PADDLE_ENFORCE(output_format.at(static_cast<size_t>(offset) + 1) <=
+                     static_cast<int>(outputs_.size()),
                 "Output Out of Range");
  return std::vector<std::string>{
      outputs_.begin() + output_format.at(offset),
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@ -55,6 +55,10 @@ class OperatorBase {
  /// e.g. Variable "x@GRAD" is the gradient of variable "x".
  static std::string GRAD_VAR_SUFFIX() { return "@GRAD"; }

+  static std::string GRAD_VAR_NAME(const std::string& name) {
+    return name + GRAD_VAR_SUFFIX();
+  }
+
  /// Variables with this suffix are supposed to be filled up with zeros.
  static std::string ZERO_VAR_SUFFIX() { return "@ZERO"; }

@ -161,22 +165,30 @@ class OperatorContext {

  template <typename T>
  const T* Input(const size_t index) const {
-    return &(InputVar(index)->Get<T>());
+    auto var = InputVar(index);
+    PADDLE_ENFORCE(var != nullptr, "Input(%d) should not be nullptr", index);
+    return &var->Get<T>();
  }

  template <typename T>
  T* Output(const size_t index) const {
-    return OutputVar(index)->GetMutable<T>();
+    auto var = OutputVar(index);
+    PADDLE_ENFORCE(var != nullptr, "Output(%d) should not be nullptr", index);
+    return var->GetMutable<T>();
  }

  template <typename T>
  const T* Input(const std::string& name) const {
-    return &(InputVar(name)->Get<T>());
+    auto var = InputVar(name);
+    PADDLE_ENFORCE(var != nullptr, "Input(%s) should not be nullptr", name);
+    return &var->Get<T>();
  }

  template <typename T>
  T* Output(const std::string& name) const {
-    return OutputVar(name)->GetMutable<T>();
+    auto var = OutputVar(name);
+    PADDLE_ENFORCE(var != nullptr, "Output(%s) should not be nullptr", name);
+    return var->GetMutable<T>();
  }

  template <typename T>
@ -185,8 +197,12 @@ class OperatorContext {
    std::vector<const T*> res;
    res.reserve(names.size());
    std::transform(names.begin(), names.end(), std::back_inserter(res),
-                   [this](const std::string& name) {
-                     return &scope_.FindVar(name)->Get<T>();
+                   [&](const std::string& sub_name) {
+                     auto var = scope_.FindVar(sub_name);
+                     PADDLE_ENFORCE(var != nullptr,
+                                    "MultiInput(%s:%s) should not be nullptr",
+                                    name, sub_name);
+                     return &var->Get<T>();
                   });
    return res;
  }
@ -197,8 +213,12 @@ class OperatorContext {
    std::vector<const T*> res;
    res.reserve(names.size());
    std::transform(names.begin(), names.end(), std::back_inserter(res),
-                   [this](const std::string& name) {
-                     return scope_.FindVar(name)->GetMutable<T>();
+                   [&](const std::string& sub_name) {
+                     auto var = scope_.FindVar(sub_name);
+                     PADDLE_ENFORCE(var != nullptr,
+                                    "MultiOutput(%s:%s) should not be nullptr",
+                                    name, sub_name);
+                     return var->GetMutable<T>();
                   });
    return res;
  }
@ -237,7 +257,7 @@ class ExecutionContext : public OperatorContext {
  template <typename PlaceType,
            typename DeviceType =
                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
-  DeviceType* GetEigenDevice() const;
+  DeviceType& GetEigenDevice() const;

  platform::Place GetPlace() const { return device_context_.GetPlace(); }

--- a/paddle/framework/pybind.cc
+++ b/paddle/framework/pybind.cc
@ -0,0 +1,229 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <Python.h>
+#include <fstream>
+#include <vector>
+
+#include "paddle/framework/backward.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/tensor_py.h"
+#include "paddle/operators/net_op.h"
+#include "paddle/operators/type_alias.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace py = pybind11;
+
+USE_OP(add_two);
+USE_OP(onehot_cross_entropy);
+USE_OP_WITHOUT_KERNEL(fc);
+USE_OP(sgd);
+USE_OP(mul);
+USE_OP(mean);
+USE_OP(sigmoid);
+USE_OP(softmax);
+USE_OP(rowwise_add);
+USE_OP_WITHOUT_KERNEL(recurrent_op);
+namespace paddle {
+namespace framework {
+template <typename ClassType>
+void ExposeOperator(ClassType &m) {
+  m.def("infer_shape", &ClassType::type::InferShape)
+      .def("run", &ClassType::type::Run)
+      .def("type",
+           [](const typename ClassType::type &op) -> std::string {
+             return op.type_;
+           })
+      .def("outputs",
+           [](const typename ClassType::type &op) -> std::vector<std::string> {
+             return op.outputs_;
+           })
+      .def("__str__", &ClassType::type::DebugString);
+}
+
+static size_t UniqueIntegerGenerator() {
+  static std::atomic<size_t> generator;
+  return generator.fetch_add(1);
+}
+
+bool IsCompileGPU() {
+#ifdef PADDLE_ONLY_CPU
+  return false;
+#else
+  return true;
+#endif
+}
+
+PYBIND11_PLUGIN(core) {
+  py::module m("core", "C++ core of PaddlePaddle");
+
+  py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
+      .def_buffer(
+          [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
+      .def("get_dims",
+           [](const Tensor &self) { return vectorize(self.dims()); })
+      .def("set_dims",
+           [](Tensor &self, const std::vector<int> &dim) {
+             self.Resize(make_ddim(dim));
+           })
+      .def("alloc_float",
+           [](Tensor &self, paddle::platform::GPUPlace &place) {
+             self.mutable_data<float>(place);
+           })
+      .def("alloc_float",
+           [](Tensor &self, paddle::platform::CPUPlace &place) {
+             self.mutable_data<float>(place);
+           })
+      .def("alloc_int",
+           [](Tensor &self, paddle::platform::CPUPlace &place) {
+             self.mutable_data<int>(place);
+           })
+      .def("alloc_int",
+           [](Tensor &self, paddle::platform::GPUPlace &place) {
+             self.mutable_data<int>(place);
+           })
+      .def("set", PyCPUTensorSetFromArray<float>)
+      .def("set", PyCPUTensorSetFromArray<int>)
+#ifndef PADDLE_ONLY_CPU
+      .def("set", PyCUDATensorSetFromArray<float>)
+      .def("set", PyCUDATensorSetFromArray<int>)
+#endif
+      .def("shape", [](Tensor &self) { return vectorize(self.dims()); });
+
+  py::class_<Variable>(m, "Variable", R"DOC(Variable Class.
+
+All parameter, weight, gradient are variables in Paddle.
+)DOC")
+      .def("is_int", [](const Variable &var) { return var.IsType<int>(); })
+      .def("set_int",
+           [](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
+      .def("get_int", [](const Variable &var) -> int { return var.Get<int>(); })
+      .def("get_tensor",
+           [](Variable &self) -> Tensor * { return self.GetMutable<Tensor>(); },
+           py::return_value_policy::reference)
+      .def("get_net",
+           [](Variable &self) -> ops::NetOp * {
+             return self.GetMutable<ops::NetOp>();
+           },
+           py::return_value_policy::reference);
+
+  py::class_<Scope>(m, "Scope", "")
+      .def("new_var",
+           [](Scope &self, const std::string &name) -> Variable * {
+             return self.NewVar(name);
+           },
+           py::return_value_policy::reference)
+      .def("find_var", &Scope::FindVar, py::return_value_policy::reference)
+      .def(py::init<>())
+      .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
+           py::return_value_policy::reference)
+      .def("drop_kids", &Scope::DropKids);
+
+  //! @note: Be careful! PyBind will return std::string as a unicode object,
+  //! not a Python str. If you want a str object, you should cast it in Python.
+  m.def("get_all_op_protos", []() -> std::vector<py::bytes> {
+    auto &protos = OpRegistry::protos();
+    std::vector<py::bytes> ret_values;
+    for (auto it = protos.begin(); it != protos.end(); ++it) {
+      PADDLE_ENFORCE(it->second.IsInitialized(),
+                     "OpProto must all be initialized");
+      std::string str;
+      PADDLE_ENFORCE(it->second.SerializeToString(&str),
+                     "Serialize OpProto Error. This could be a bug of Paddle.");
+      ret_values.push_back(py::bytes(str));
+    }
+    return ret_values;
+  });
+  m.def_submodule(
+       "var_names",
+       "The module will return special predefined variable name in Paddle")
+      .def("empty", OperatorBase::EMPTY_VAR_NAME)
+      .def("temp", OperatorBase::TMP_VAR_NAME);
+  // clang-format off
+  py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
+      .def_static("create",
+                  [](paddle::platform::CPUPlace& place)
+                      -> paddle::platform::DeviceContext* {
+                    return new paddle::platform::CPUDeviceContext();
+                  })
+      .def_static("create",
+                  [](paddle::platform::GPUPlace& place)
+                      -> paddle::platform::DeviceContext* {
+#ifdef PADDLE_ONLY_CPU
+                    PADDLE_THROW("GPUPlace is not supported in CPU device.");
+#else
+                    return new paddle::platform::CUDADeviceContext(place);
+#endif
+                  });
+  // clang-format on
+
+  py::class_<paddle::platform::GPUPlace>(m, "GPUPlace").def(py::init<int>());
+
+  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace").def(py::init<>());
+
+  py::class_<OperatorBase, std::shared_ptr<OperatorBase>> operator_base(
+      m, "Operator");
+
+  operator_base.def_static("create", [](py::bytes protobin) {
+    OpDesc desc;
+    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
+                   "Cannot parse user input to OpDesc");
+    PADDLE_ENFORCE(desc.IsInitialized(),
+                   "User OpDesc is not initialized, reason %s",
+                   desc.InitializationErrorString());
+    return OpRegistry::CreateOp(desc);
+  });
+
+  operator_base.def("backward",
+                    [](const OperatorBase &forwardOp,
+                       const std::unordered_set<std::string> &no_grad_vars) {
+                      return Backward(forwardOp, no_grad_vars);
+                    });
+
+  ExposeOperator(operator_base);
+
+  py::class_<ops::NetOp, std::shared_ptr<ops::NetOp>> net(m, "Net");
+
+  net.def_static("create",
+                 []() -> std::shared_ptr<ops::NetOp> {
+                   auto retv = std::make_shared<ops::NetOp>();
+                   retv->type_ = "plain_net";
+                   return retv;
+                 })
+      .def("add_op", &ops::NetOp::AddOp)
+      .def(
+          "add_op",
+          [](ops::NetOp &self, const std::shared_ptr<ops::NetOp> &net) -> void {
+            self.AddOp(std::static_pointer_cast<OperatorBase>(net));
+          })
+      .def("complete_add_op", &ops::NetOp::CompleteAddOp)
+      .def("complete_add_op",
+           [](std::shared_ptr<ops::NetOp> &self) { self->CompleteAddOp(); });
+
+  ExposeOperator(net);
+
+  m.def("unique_integer", UniqueIntegerGenerator);
+
+  m.def("is_compile_gpu", IsCompileGPU);
+
+  return m.ptr();
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@ -26,19 +26,17 @@ limitations under the License. */
 #include "unsupported/Eigen/CXX11/Tensor"

 namespace paddle {
-namespace pybind {
-namespace details {  // forward declare
-template <bool less, size_t i, typename... args>
-struct CastToPyBufferImpl;
-}  // namespace details
-}  // namespace pybind

 namespace framework {
+namespace details {  // forward declare
+template <bool less, size_t i, typename... args>
+struct CastToPyBufferImpl;  // declared here so Tensor can befriend it below
+}  // namespace details

 class Tensor {
 public:
  template <bool less, size_t i, typename... args>
-  friend struct paddle::pybind::details::CastToPyBufferImpl;
+  friend struct details::CastToPyBufferImpl;

  template <typename T, size_t D, int MajorType, typename IndexType>
  friend struct EigenTensor;
@ -167,4 +165,4 @@ class Tensor {
 }  // namespace framework
 }  // namespace paddle

-#include "paddle/framework/detail/tensor-inl.h"
+#include "paddle/framework/tensor_impl.h"
--- a/paddle/framework/detail/tensor-inl.h
+++ b/paddle/framework/detail/tensor-inl.h
@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-
 #include "paddle/memory/memcpy.h"

 namespace paddle {
@ -62,9 +61,11 @@ inline T* Tensor::mutable_data(platform::Place place) {
    if (platform::is_cpu_place(place)) {
      holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
          boost::get<platform::CPUPlace>(place), size));
+    } else if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_ONLY_CPU
+      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
    }
-#ifndef PADDLE_ONLY_CPU
-    else if (platform::is_gpu_place(place)) {
+#else
      holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
          boost::get<platform::GPUPlace>(place), size));
    }
--- a/paddle/framework/tensor_py.h
+++ b/paddle/framework/tensor_py.h
@ -13,15 +13,17 @@
   limitations under the License. */

 #pragma once
-#include <paddle/framework/tensor.h>
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
+#include <string>
+#include "paddle/framework/tensor.h"
+#include "paddle/memory/memcpy.h"
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"

 namespace py = pybind11;

 namespace paddle {

-namespace pybind {
+namespace framework {

 namespace details {

@ -40,9 +42,6 @@ template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<true, I, ARGS...> {
  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
  py::buffer_info operator()(framework::Tensor &tensor) {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(tensor.holder_->place()),
-                   "Only CPU tensor can cast to numpy array");
-
    if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
      auto dim_vec = framework::vectorize(tensor.dims());
      std::vector<size_t> dims_outside;
@ -56,14 +55,16 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
        strides[i - 1] = sizeof(CUR_TYPE) * prod;
        prod *= dims_outside[i - 1];
      }
-
+      framework::Tensor dst_tensor;
+      if (paddle::platform::is_gpu_place(tensor.holder_->place())) {
+        dst_tensor.CopyFrom<CUR_TYPE>(tensor, platform::CPUPlace());
+      } else if (paddle::platform::is_cpu_place(tensor.holder_->place())) {
+        dst_tensor = tensor;
+      }
      return py::buffer_info(
-          tensor.mutable_data<CUR_TYPE>(tensor.holder_->place()),
-          sizeof(CUR_TYPE),
-          py::format_descriptor<CUR_TYPE>::format(),
-          (size_t)framework::arity(tensor.dims()),
-          dims_outside,
-          strides);
+          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
+          sizeof(CUR_TYPE), py::format_descriptor<CUR_TYPE>::format(),
+          (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
    } else {
      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
@ -77,9 +78,10 @@ inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
 }

 template <typename T>
-void PyTensorSetFromArray(
+void PyCPUTensorSetFromArray(
    framework::Tensor &self,
-    py::array_t<T, py::array::c_style | py::array::forcecast> array) {
+    py::array_t<T, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::CPUPlace &place) {
  std::vector<int> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
@ -87,9 +89,28 @@ void PyTensorSetFromArray(
  }

  self.Resize(framework::make_ddim(dims));
-  auto *dst = self.mutable_data<T>(paddle::platform::CPUPlace());
+  auto *dst = self.mutable_data<T>(place);
  std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }

+#ifndef PADDLE_ONLY_CPU
+// Resizes `self` to the shape of `array`, obtains its storage on `place` via
+// mutable_data<T>() and copies the host contents of `array` into it with
+// GpuMemcpySync (cudaMemcpyHostToDevice).
+template <typename T>
+void PyCUDATensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<T, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::GPUPlace &place) {
+  const size_t rank = array.ndim();
+  std::vector<int> shape;
+  shape.reserve(rank);
+  for (size_t axis = 0; axis != rank; ++axis) {
+    shape.push_back(static_cast<int>(array.shape()[axis]));
+  }
+  self.Resize(framework::make_ddim(shape));
+
+  const size_t num_bytes = sizeof(T) * array.size();
+  T *device_ptr = self.mutable_data<T>(place);
+  paddle::platform::GpuMemcpySync(device_ptr, array.data(), num_bytes,
+                                  cudaMemcpyHostToDevice);
+}
+#endif
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
@ -109,6 +109,13 @@ protected:
    return filter[filter.ndims() - 1];
  }

+  // im2col can be skipped only for a 1x1 filter applied with unit stride and
+  // zero padding; any other configuration needs it.
+  inline bool isNeedIm2col(const TensorShape& filter) const {
+    return getFilterHeight(filter) != 1 || getFilterWidth(filter) != 1 ||
+           strideH() != 1 || strideW() != 1 || paddingH() != 0 ||
+           paddingW() != 0;
+  }
+
  std::vector<size_t> strides_;
  std::vector<size_t> paddings_;

--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@ -66,16 +66,23 @@ public:
    real* inputData = inputs[0].data<real>();
    real* filterData = inputs[1].data<real>();
    real* outputData = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape = TensorShape({inputChannels / groups_,
-                                        filterHeight,
-                                        filterWidth,
-                                        outputHeight,
-                                        outputWidth});

-    resizeBuffer<Device>(colShape.getElements());
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+    TensorShape colShape;
+    real* colData = NULL;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }

    Im2ColFunctor<kCFO, Device, real> im2col;
    GemmFunctor<Device, real> gemm;
@ -86,15 +93,18 @@ public:

    for (size_t i = 0; i < batchSize; i++) {
      for (size_t g = 0; g < groups_; g++) {
-        im2col(inputData + g * inputOffset,
-               imShape,
-               colData,
-               colShape,
-               strideH(),
-               strideW(),
-               paddingH(),
-               paddingW());
-
+        if (needIm2col) {
+          im2col(inputData + g * inputOffset,
+                 imShape,
+                 colData,
+                 colShape,
+                 strideH(),
+                 strideW(),
+                 paddingH(),
+                 paddingW());
+        } else {
+          colData = inputData + g * inputOffset;
+        }
        int M = outputChannels / groups_;
        int N = outputHeight * outputWidth;
        int K = inputChannels / groups_ * filterHeight * filterWidth;
@ -159,19 +169,27 @@ public:
    real* outputGrad = inputs[0].data<real>();
    real* filterData = inputs[1].data<real>();
    real* inputGrad = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape = TensorShape({inputChannels / groups_,
-                                        filterHeight,
-                                        filterWidth,
-                                        outputHeight,
-                                        outputWidth});

-    resizeBuffer<Device>(colShape.getElements());
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+    TensorShape colShape;
+    real* colData = NULL;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }

    Col2ImFunctor<kCFO, Device, real> col2im;
    GemmFunctor<Device, real> gemm;
+
    size_t inputOffset = imShape.getElements();
    size_t outputOffset =
        (outputChannels / groups_) * outputHeight * outputWidth;
@ -182,6 +200,11 @@ public:
        int K = outputChannels / groups_;
        int N = outputHeight * outputWidth;
        int M = inputChannels / groups_ * filterHeight * filterWidth;
+        real scale = 0.0f;
+        if (!needIm2col) {
+          colData = inputGrad + g * inputOffset;
+          scale = 1.0f;
+        }
        gemm(CblasTrans,
             CblasNoTrans,
             M,
@ -192,17 +215,19 @@ public:
             M,
             outputGrad + g * outputOffset,
             N,
-             0.0f,
+             scale,
             colData,
             N);
-        col2im(inputGrad + g * inputOffset,
-               imShape,
-               colData,
-               colShape,
-               strideH(),
-               strideW(),
-               paddingH(),
-               paddingW());
+        if (needIm2col) {
+          col2im(inputGrad + g * inputOffset,
+                 imShape,
+                 colData,
+                 colShape,
+                 strideH(),
+                 strideW(),
+                 paddingH(),
+                 paddingW());
+        }
      }
      inputGrad += inputChannels * inputHeight * inputWidth;
      outputGrad += outputChannels * outputHeight * outputWidth;
@ -255,16 +280,23 @@ public:
    real* outputGrad = inputs[0].data<real>();
    real* inputData = inputs[1].data<real>();
    real* filterGrad = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape = TensorShape({inputChannels / groups_,
-                                        filterHeight,
-                                        filterWidth,
-                                        outputHeight,
-                                        outputWidth});

-    resizeBuffer<Device>(colShape.getElements());
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+    TensorShape colShape;
+    real* colData = NULL;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }

    Im2ColFunctor<kCFO, Device, real> im2col;
    GemmFunctor<Device, real> gemm;
@ -274,15 +306,18 @@ public:
    size_t filterOffset = filter.getElements() / groups_;
    for (size_t i = 0; i < batchSize; i++) {
      for (size_t g = 0; g < groups_; g++) {
-        im2col(inputData + g * inputOffset,
-               imShape,
-               colData,
-               colShape,
-               strideH(),
-               strideW(),
-               paddingH(),
-               paddingW());
-
+        if (needIm2col) {
+          im2col(inputData + g * inputOffset,
+                 imShape,
+                 colData,
+                 colShape,
+                 strideH(),
+                 strideW(),
+                 paddingH(),
+                 paddingW());
+        } else {
+          colData = inputData + g * inputOffset;
+        }
        int M = outputChannels / groups_;
        int K = outputHeight * outputWidth;
        int N = inputChannels / groups_ * filterHeight * filterWidth;
--- a/paddle/gserver/layers/ClipLayer.cpp
+++ b/paddle/gserver/layers/ClipLayer.cpp
@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer that clips every input value to the range [p_1, p_2] = [min, max].
+ * \f[
+ *   out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right)
+ * \f]
+ */
+
+class ClipLayer : public Layer {
+protected:
+  double min_;  // lower clipping bound; read from clip_conf().min() in init()
+  double max_;  // upper clipping bound; read from clip_conf().max() in init()
+
+public:
+  explicit ClipLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(clip, ClipLayer);
+
+// Initializes the base layer and reads the clipping range [min, max] from the
+// clip_conf of input 0. Returns false when the base-class initialization
+// fails; CHECK-fails when the layer does not have exactly one input or when
+// min is not strictly less than max.
+bool ClipLayer::init(const LayerMap& layerMap,
+                     const ParameterMap& parameterMap) {
+  // Propagate a base-class failure instead of silently discarding it.
+  if (!Layer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  CHECK_EQ(inputLayers_.size(), 1U);
+  // Bind by reference: a plain `auto` would copy the whole config message.
+  const auto& layerConf = config_.inputs(0).clip_conf();
+  min_ = layerConf.min();
+  max_ = layerConf.max();
+  CHECK_LT(min_, max_);
+  return true;
+}
+
+// Forward pass: sizes the output like the input, copies the input into it and
+// then calls clip(min_, max_) on the copy.
+void ClipLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr input = getInputValue(0);
+  const auto rows = input->getHeight();
+  const auto cols = input->getWidth();
+  resetOutput(rows, cols);
+
+  MatrixPtr output = getOutputValue();
+  output->copyFrom(*input);
+  output->clip(min_, max_);
+}
+
+// Backward pass: builds a temporary matrix with clipDerivative() from the
+// input value and the [min_, max_] range, then folds the output gradient
+// times that matrix into the input gradient. Does nothing when input 0 has no
+// gradient matrix.
+void ClipLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  if (!inG) {
+    return;
+  }
+
+  // The unused output-value local of the previous version was dropped.
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr tmpMtx;
+  Matrix::resizeOrCreate(
+      tmpMtx, outG->getHeight(), outG->getWidth(), false, useGpu_);
+  tmpMtx->clipDerivative(*inV, min_, max_);
+  // presumably inG = 1 * inG + 1 * (outG .* tmpMtx) -- confirm addDotMul args
+  inG->addDotMul(*outG, *tmpMtx, 1, 1);
+}
+
+}  // namespace paddle
--- a/paddle/gserver/layers/RowL2NormLayer.cpp
+++ b/paddle/gserver/layers/RowL2NormLayer.cpp
@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer for L2 normalization in each row,
+ * \f[
+ *   out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}}
+ * \f]
+ * where the size of \f$in\f$ is (batchSize x dataDim),
+ * and the size of \f$out\f$ is (batchSize x dataDim).
+ */
+
+class RowL2NormLayer : public Layer {
+protected:
+  MatrixPtr inSquare_;  // batchSize x dataDim; also scratch in backward()
+  MatrixPtr l2NormReciprocal_;  // batchSize x 1; filled by forward()
+  MatrixPtr dotSum_;  // batchSize x 1; scratch used by backward()
+
+public:
+  explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(row_l2_norm, RowL2NormLayer);
+
+// Initializes the base layer and verifies that the layer has exactly one
+// input. Returns false when the base-class initialization fails.
+bool RowL2NormLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  // Propagate a base-class failure instead of silently discarding it.
+  if (!Layer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  CHECK_EQ(inputLayers_.size(), 1U);
+
+  return true;
+}
+
+void RowL2NormLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+
+  /* allocate (or resize) output_ to batchSize x dataDim if necessary */
+  size_t batchSize = inV->getHeight();
+  size_t dataDim = getSize();
+  CHECK_EQ(dataDim, inV->getWidth());
+  resetOutput(batchSize, dataDim);
+  MatrixPtr outV = getOutputValue();
+
+  Matrix::resizeOrCreate(inSquare_, batchSize, dataDim, false, useGpu_);
+  inV->square2(*inSquare_);  // presumably inSquare_ = inV squared element-wise
+  Matrix::resizeOrCreate(l2NormReciprocal_, batchSize, 1, false, useGpu_);
+  inSquare_->rowSum(*l2NormReciprocal_);  // presumably per-row sum of squares
+  l2NormReciprocal_->sqrt2(*l2NormReciprocal_);  // in place; presumably sqrt
+  // NOTE(review): no guard for an all-zero row before taking the reciprocal
+  l2NormReciprocal_->scalarDiv(*l2NormReciprocal_, 1.0);  // presumably 1 / x
+  outV->rowScale(0, *inV, *l2NormReciprocal_);  // outV from inV, scaled per row
+}
+
+void RowL2NormLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr outG = getOutputGrad();
+  size_t batchSize = inV->getHeight();
+
+  // inG[ij] += outG[ij] * l2NormReciprocal[i]
+  // inG[ij] -= inV[ij] * l2NormReciprocal[i]^2 * DotMul(outG[i], outV[i])
+  // (outV is the output written by forward(), not the raw input inV)
+  if (inG) {
+    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
+    dotSum_->zeroMem();
+    dotSum_->rowDotMul(0, *outG, *outV);
+    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
+    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
+    inSquare_->rowScale(0, *inV, *dotSum_);  // inSquare_ reused as scratch
+    inG->sub(*inSquare_);
+    inG->addRowScale(0, *outG, *l2NormReciprocal_);
+  }
+}
+
+}  // namespace paddle
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@ -1899,6 +1899,36 @@ TEST(Layer, CropLayer) {
  }
 }

+// Gradient check for the "clip" layer with a random clipping range, run once
+// with the useGpu flag off and once with it on.
+TEST(Layer, ClipLayer) {
+  const size_t batchSize = 128;
+  const size_t size = 512;
+  TestConfig config;
+  config.layerConfig.set_type("clip");
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ClipConfig* layerConf = input->mutable_clip_conf();
+  double p1 = std::rand() / (double)RAND_MAX;
+  double p2 = std::rand() / (double)RAND_MAX;
+  // ClipLayer::init() CHECKs min < max strictly, so redraw on a tie; a tie is
+  // not negligible on platforms where RAND_MAX is small.
+  while (p1 == p2) {
+    p2 = std::rand() / (double)RAND_MAX;
+  }
+  layerConf->set_min(std::min(p1, p2));
+  layerConf->set_max(std::max(p1, p2));
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "clip", batchSize, false, useGpu, false);
+  }
+}
+
+// Gradient check for the "row_l2_norm" layer, run once with the useGpu flag
+// off and once with it on.
+TEST(Layer, RowL2NormLayer) {
+  const size_t kBatchSize = 128;
+  const size_t kWidth = 512;
+
+  TestConfig config;
+  config.layerConfig.set_type("row_l2_norm");
+  config.layerConfig.set_size(kWidth);
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "input", kWidth, 0});
+
+  for (bool onGpu : {false, true}) {
+    testLayerGrad(config, "row_l2_norm", kBatchSize, false, onGpu, false);
+  }
+}
+
 int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);
--- a/Show More
+++ b/Show More