Merge remote-tracking branch 'origin/develop' into random_op

8 years ago · c110f56574
parent 5ad9474bf7 1294b3c53e
commit c110f56574
60 changed files with 1966 additions and 399 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -36,6 +36,8 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
 option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@ -74,6 +76,10 @@ if(ANDROID)
        "Disable PYTHON when cross-compiling for Android" FORCE)
    set(WITH_RDMA OFF CACHE STRING
        "Disable RDMA when cross-compiling for Android" FORCE)
    set(WITH_MKLDNN OFF CACHE STRING
        "Disable MKLDNN when cross-compiling for Android" FORCE)
    set(WITH_MKLML OFF CACHE STRING
        "Disable MKLML package when cross-compiling for Android" FORCE)
 endif(ANDROID)
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@ -87,6 +93,7 @@ endif()
 ########################################################################################
 include(external/mklml)     # download mklml package
 include(external/zlib)      # download, build, install zlib
 include(external/gflags)    # download, build, install gflags
 include(external/glog)      # download, build, install glog
@ -94,6 +101,7 @@ include(external/gtest)     # download, build, install gtest
 include(external/protobuf)  # download, build, install protobuf
 include(external/python)    # download, build, install python
 include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download linb::any
@ -135,6 +143,10 @@ if(WITH_GPU)
    endif(NOT WITH_DSO)
 endif(WITH_GPU)
 # When MKL-DNN support is enabled, add its library (and the OpenMP runtime
 # library recorded in MKLDNN_IOMP_LIB, if any) to the external link list.
 if(WITH_MKLDNN)
    list(APPEND EXTERNAL_LIBS
        ${MKLDNN_LIBRARY}
        ${MKLDNN_IOMP_LIB})
 endif()
 if(USE_NNPACK)
    include(external/nnpack)
    list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@ -15,23 +15,44 @@
 set(CBLAS_FOUND OFF)
-## Find MKL First.
+## Find MKLML First.
-set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
+if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
-set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
+  set(CBLAS_FOUND ON)
  set(CBLAS_PROVIDER MKLML)
  set(CBLAS_INC_DIR ${MKLML_INC_DIR})
  set(CBLAS_LIBRARIES ${MKLML_LIB})
  add_definitions(-DPADDLE_USE_MKLML)
  add_definitions(-DLAPACK_FOUND)
  message(STATUS "Found cblas and lapack in MKLML "
    "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
  return()
 endif()
 ## Then find MKL.
 # Two candidate roots are searched: the MKL_ROOT environment variable (read
 # once at configure time and cached) and the default Intel install location.
 set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs")
 set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL")
 # Header directories, MKL_ROOT first.
 set(MKL_INCLUDE_SEARCH_PATHS
     "${MKL_ROOT}/include"
     "${INTEL_MKL_ROOT}/include")
 # Library directories; each root is tried with and without the intel64 suffix.
 set(MKL_LIB_SEARCH_PATHS
     "${MKL_ROOT}/lib"
     "${MKL_ROOT}/lib/intel64"
     "${INTEL_MKL_ROOT}/lib"
     "${INTEL_MKL_ROOT}/lib/intel64")
 find_path(MKL_INC_DIR mkl.h PATHS
-  ${MKL_ROOT}/include)
+  ${MKL_INCLUDE_SEARCH_PATHS})
 find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
-  ${MKL_ROOT}/include)
+  ${MKL_INCLUDE_SEARCH_PATHS})
 find_library(MKL_CORE_LIB NAMES mkl_core PATHS
-  ${MKL_ROOT}/lib
+  ${MKL_LIB_SEARCH_PATHS})
  ${MKL_ROOT}/lib/intel64)
 find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
-  ${MKL_ROOT}/lib
+  ${MKL_LIB_SEARCH_PATHS})
  ${MKL_ROOT}/lib/intel64)
 find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
-  ${MKL_ROOT}/lib
+  ${MKL_LIB_SEARCH_PATHS})
  ${MKL_ROOT}/lib/intel64)
 if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
  set(CBLAS_FOUND ON)
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -67,6 +67,30 @@ else()
    include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)
 # MKL-DNN build configuration: define PADDLE_USE_MKLDNN and pick an OpenMP
 # runtime. If MKLML is enabled and its OpenMP directory is known, compile with
 # -fopenmp and link libiomp5 from that directory; otherwise fall back to
 # whatever find_package(OpenMP) detects.
 if(WITH_MKLDNN)
    add_definitions(-DPADDLE_USE_MKLDNN)
    if (WITH_MKLML AND MKLDNN_IOMP_DIR)
        message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}")
        set(OPENMP_FLAGS "-fopenmp")
        # NOTE(review): presumably keeps -fopenmp off the shared-library link
        # line so only libiomp5 is linked as the OpenMP runtime -- confirm.
        set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
        set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
    else()
        find_package(OpenMP)
        if(OPENMP_FOUND)
            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
        else()
            # message() joins its string arguments without a separator, so the
            # first fragment must end with a space.
            message(WARNING "Can not find OpenMP. "
                 "Some performance features in MKLDNN may not be available")
        endif()
    endif()
 endif(WITH_MKLDNN)
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@ -34,9 +34,15 @@ IF(WITH_TESTING)
            "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
    ENDIF(WIN32)
    IF(WITH_MKLML)
        # Make gtest wait until the MKLML download has completed: GTEST_DEPENDS
        # is handed to the DEPENDS argument of ExternalProject_Add below.
        SET(GTEST_DEPENDS   ${MKLML_PROJECT})
    ENDIF()
    ExternalProject_Add(
        extern_gtest
        ${EXTERNAL_PROJECT_LOG_ARGS}
        DEPENDS         ${GTEST_DEPENDS}
        GIT_REPOSITORY  "https://github.com/google/googletest.git"
        GIT_TAG         "release-1.8.0"
        PREFIX          ${GTEST_SOURCES_DIR}
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@ -0,0 +1,72 @@
 # Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Nothing to configure unless MKL-DNN support was requested. The option is
 # tested by name rather than by expansion, so an unset or empty WITH_MKLDNN
 # is treated as "off" instead of collapsing the condition to `IF(NOT )`.
 IF(NOT WITH_MKLDNN)
  return()
 ENDIF()
 INCLUDE(ExternalProject)
 # Name of the external project target and the directory it is built under.
 SET(MKLDNN_PROJECT        "extern_mkldnn")
 SET(MKLDNN_SOURCES_DIR    ${THIRD_PARTY_PATH}/mkldnn)
 # Install beneath CMAKE_INSTALL_PREFIX only when $HOME is /root; for any
 # other $HOME value the install root is that directory instead.
 IF("$ENV{HOME}" STREQUAL "/root")
    SET(MKLDNN_INSTALL_ROOT   ${CMAKE_INSTALL_PREFIX})
 ELSE()
    SET(MKLDNN_INSTALL_ROOT   "$ENV{HOME}")
 ENDIF()
 SET(MKLDNN_INSTALL_DIR    "${MKLDNN_INSTALL_ROOT}/opt/paddle/third_party/mkldnn")
 # Forced into the cache so the include directory always tracks the install dir.
 SET(MKLDNN_INCLUDE_DIR    "${MKLDNN_INSTALL_DIR}/include"
     CACHE PATH "mkldnn include directory." FORCE)
 # Windows is not supported: warn, set WITH_MKLDNN to OFF in the current scope
 # and stop processing this file. Elsewhere, record the shared library path and
 # append its directory to the install RPATH.
 IF(WIN32)
    # message() joins its string arguments without a separator, so the first
    # fragment must end with a space.
    MESSAGE(WARNING "It is not supported compiling with mkldnn in windows Paddle yet. "
      "Force WITH_MKLDNN=OFF")
    SET(WITH_MKLDNN OFF)
    return()
 ELSE(WIN32)
    SET(MKLDNN_LIBRARY "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
    MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
    SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
    #SET(CMAKE_MACOSX_RPATH 1) # hold for MacOS
    SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
 ENDIF(WIN32)
 INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR})
 # When MKLML is the BLAS provider, build MKL-DNN after the MKLML project and
 # point it at the MKLML root and its OpenMP library/directory.
 # CBLAS_PROVIDER is quoted so that an unset or empty value compares as ""
 # instead of leaving STREQUAL without a left-hand operand.
 IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
    SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
    SET(MKLDNN_MKLROOT   ${MKLML_ROOT})
    SET(MKLDNN_IOMP_LIB  ${MKLML_IOMP_LIB})
    SET(MKLDNN_IOMP_DIR  ${MKLML_LIB_DIR})
 ENDIF()
 # Clone MKL-DNN v0.9, then configure, build and install it from
 # <SOURCE_DIR>/build into MKLDNN_INSTALL_DIR.
 # ${CMAKE_COMMAND} replaces the bare `mkdir`/`cmake` invocations so the nested
 # steps run with the same CMake executable that configured this project rather
 # than whatever happens to be first on PATH at build time.
 ExternalProject_Add(
    ${MKLDNN_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
    GIT_TAG             "v0.9"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    CONFIGURE_COMMAND   ${CMAKE_COMMAND} -E make_directory <SOURCE_DIR>/build
    BUILD_COMMAND       cd <SOURCE_DIR>/build
                        && ${CMAKE_COMMAND} .. -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} -DMKLROOT=${MKLDNN_MKLROOT}
                        && $(MAKE)
    INSTALL_COMMAND     cd <SOURCE_DIR>/build && $(MAKE) install
    UPDATE_COMMAND      ""
 )
 MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIBRARY}")
 # Expose the installed shared library as the imported target `mkldnn`, which
 # depends on the external project, and register it with the other
 # external-project dependencies.
 ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
 SET_TARGET_PROPERTIES(mkldnn PROPERTIES
    IMPORTED_LOCATION "${MKLDNN_LIBRARY}")
 ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
 LIST(APPEND external_project_dependencies mkldnn)
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@ -0,0 +1,64 @@
 # Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Nothing to configure unless the MKLML package was requested. The option is
 # tested by name rather than by expansion, so an unset or empty WITH_MKLML
 # is treated as "off" instead of collapsing the condition to `IF(NOT )`.
 IF(NOT WITH_MKLML)
  return()
 ENDIF()
 INCLUDE(ExternalProject)
 # Identity of the prebuilt MKLML package and the URL it is fetched from.
 SET(MKLML_PROJECT       "extern_mklml")
 SET(MKLML_VER           "mklml_lnx_2018.0.20170425")
 SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
 # Build-tree locations used by the external project.
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 # Install beneath CMAKE_INSTALL_PREFIX only when $HOME is /root; for any
 # other $HOME value the install root is that directory instead.
 SET(MKLML_DST_DIR       "opt/paddle/third_party/mklml")
 IF("$ENV{HOME}" STREQUAL "/root")
    SET(MKLML_INSTALL_ROOT  "${CMAKE_INSTALL_PREFIX}")
 ELSE()
    SET(MKLML_INSTALL_ROOT  "$ENV{HOME}")
 ENDIF()
 # Paths inside the installed package.
 SET(MKLML_INSTALL_DIR   "${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}")
 SET(MKLML_ROOT          "${MKLML_INSTALL_DIR}/${MKLML_VER}")
 SET(MKLML_INC_DIR       "${MKLML_ROOT}/include")
 SET(MKLML_LIB_DIR       "${MKLML_ROOT}/lib")
 SET(MKLML_LIB           "${MKLML_LIB_DIR}/libmklml_intel.so")
 SET(MKLML_IOMP_LIB      "${MKLML_LIB_DIR}/libiomp5.so")
 # Add the package headers to the include path and its library directory to
 # the install RPATH.
 INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_LIB_DIR}")
 # Generate a minimal CMakeLists.txt in the download directory whose only job
 # is to install the unpacked ${MKLML_VER} directory under MKLML_DST_DIR.
 # cmake_minimum_required() is written first: it must precede project() so
 # that policy defaults are established before the project is declared.
 SET(mklml_cmakefile ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt)
 FILE(WRITE ${mklml_cmakefile} "cmake_minimum_required(VERSION 3.0)\n"
                              "PROJECT(MKLML)\n"
                              "install(DIRECTORY ${MKLML_VER}\n"
                              "        DESTINATION ${MKLML_DST_DIR})\n")
 # Fetch the prebuilt MKLML tarball with wget into MKLML_DOWNLOAD_DIR and
 # unpack it with tar; no update step is run. MKLML_INSTALL_ROOT is passed as
 # CMAKE_INSTALL_PREFIX both on the command line and as an initial cache entry.
 # Presumably the configure/install steps then pick up the CMakeLists.txt
 # written into MKLML_DOWNLOAD_DIR -- confirm against ExternalProject defaults.
 # NOTE(review): --no-check-certificate disables TLS certificate verification
 # and no checksum is verified, so the downloaded archive is not authenticated.
 ExternalProject_Add(
    ${MKLML_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    PREFIX                ${MKLML_SOURCE_DIR}
    DOWNLOAD_DIR          ${MKLML_DOWNLOAD_DIR}
    DOWNLOAD_COMMAND      wget --no-check-certificate -O ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz ${MKLML_URL}
                          && tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz
    DOWNLOAD_NO_PROGRESS  1
    UPDATE_COMMAND        ""
    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} 
    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT}
 )
 # Expose the installed shared library as the imported target `mklml`, which
 # depends on the external project, and register it with the other
 # external-project dependencies.
 ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
 SET_TARGET_PROPERTIES(mklml PROPERTIES
    IMPORTED_LOCATION "${MKLML_LIB}")
 ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
 LIST(APPEND external_project_dependencies mklml)
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@ -124,6 +124,7 @@ set(GPU_COMMON_FLAGS
    -Wno-error=literal-suffix
    -Wno-error=unused-local-typedefs
    -Wno-error=unused-function  # Warnings in Numpy Header.
    -Wno-error=array-bounds # Warnings in Eigen::array
 )
 if (APPLE)
--- a/go/pserver/client/etcd_client.go
+++ b/go/pserver/client/etcd_client.go
@ -80,10 +80,10 @@ func (p *EtcdClient) List() []Server {
 	for {
 		for i := 0; i < psDesired; i++ {
 			ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
 			cancel()
 			psKey := pserver.PsPath + strconv.Itoa(i)
 			log.Debugf("checking %s", psKey)
 			resp, err := p.client.Get(ctx, psKey)
 			cancel()
 			if err != nil {
 				log.Infof("Get psKey= %s error, %v", psKey, err)
 				time.Sleep(p.timeout)
--- a/paddle/framework/eigen.h
+++ b/paddle/framework/eigen.h
@ -61,25 +61,24 @@ struct EigenTensor {
  }
 };
 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {};
 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
-  // Flatten is to reshape a Tensor into a one dimension EigenVector
+  // Flatten reshapes a Tensor into an EigenVector.
-  using Parent = EigenTensor<T, 1, MajorType, IndexType>;
+  static typename EigenVector::Type Flatten(Tensor& tensor) {
-  static typename Parent::Type Flatten(Tensor& tensor) {
+    return EigenVector::From(
-    return Parent::From(tensor,
+        tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
                        make_ddim({static_cast<int>(product(tensor.dims_))}));
  }
-  static typename Parent::ConstType Flatten(const Tensor& tensor) {
+  static typename EigenVector::ConstType Flatten(const Tensor& tensor) {
-    return Parent::From(tensor,
+    return EigenVector::From(
-                        make_ddim({static_cast<int>(product(tensor.dims_))}));
+        tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
  }
 };
 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = EigenTensor<T, 2, MajorType, IndexType>;
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/net.cc
+++ b/paddle/framework/net.cc
@ -39,19 +39,22 @@ void PlainNet::CompleteAddOp(bool calc) {
      output_set.insert(opt);
    }
  }
  inputs_.reserve(input_set.size());
  std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs_));
  std::sort(inputs_.begin(), inputs_.end());
  outputs_.reserve(output_set.size());
  std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs_));
  std::sort(outputs_.begin(), outputs_.end());
  std::vector<int> tmp_index;
  tmp_index.reserve(temp_output.size());
-  int idx = 0;
+  int output_len = static_cast<int>(outputs_.size());
-  for (auto& opt : output_set) {
+  for (int i = 0; i < output_len; ++i) {
-    if (Contains(temp_output, opt)) {
+    if (Contains(temp_output, outputs_[i])) {
-      tmp_index.push_back(idx);
+      tmp_index.push_back(i);
    }
    outputs_.push_back(opt);
    ++idx;
  }
  attrs_["temporary_index"] = tmp_index;
@ -59,9 +62,12 @@ void PlainNet::CompleteAddOp(bool calc) {
 std::string PlainNet::DebugString() const {
  std::ostringstream os;
-  os << this->type_ << ":" << std::endl;
+  os << OperatorBase::DebugString() << std::endl;
  for (auto& op : ops_) {
-    os << "\t" << op->DebugString() << std::endl;
+    std::istringstream is(op->DebugString());
    for (std::string line; std::getline(is, line);) {
      os << "    " << line << std::endl;
    }
  }
  return os.str();
 }
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@ -48,25 +48,27 @@ class Tensor {
  template <typename T>
  const T* data() const {
-    CheckDims<T>();
+    EnforceSufficientMemory<T>();
    return reinterpret_cast<const T*>(
        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
  }
  template <typename T>
  T* data() {
-    CheckDims<T>();
+    EnforceSufficientMemory<T>();
    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                offset_);
  }
-  template <typename T>
+  template <typename T,  // must be POD types
            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
  T* mutable_data(DDim dims, platform::Place place) {
-    set_dims(dims);
+    Resize(dims);
    return mutable_data<T>(place);
  }
-  template <typename T>
+  template <typename T,  // must be POD types
            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
  T* mutable_data(platform::Place place) {
    PADDLE_ENFORCE(product(dims_) > 0,
                   "Tensor's numel must be larger than zero to call "
@ -95,11 +97,9 @@ class Tensor {
  }
  template <typename T>
-  void ShareDataFrom(const Tensor& src) {
+  void ShareDataWith(const Tensor& src) {
-    src.CheckDims<T>();
+    src.EnforceSufficientMemory<T>();
-    holder_ = src.holder_;
+    *this = src;
    set_dims(src.dims());
    offset_ = src.offset_;
  }
  template <typename T>
@ -107,9 +107,9 @@ class Tensor {
    PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) &&
                       platform::is_cpu_place(dst_place),
                   "Tensor::CopyFrom only support CPU now.");
-    src.CheckDims<T>();
+    src.EnforceSufficientMemory<T>();
    size_t size = product(src.dims_) * sizeof(T);
-    set_dims(src.dims());
+    Resize(src.dims());
    const void* src_ptr = static_cast<const void*>(src.data<T>());
    void* dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
    memcpy(dst_ptr, src_ptr, size);
@ -117,34 +117,25 @@ class Tensor {
  template <typename T>
  Tensor Slice(const int& begin_idx, const int& end_idx) const {
-    CheckDims<T>();
+    EnforceSufficientMemory<T>();
-    PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0],
+    PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero.");
-                   "Slice index is less than zero or out of bound.");
+    PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound.");
    PADDLE_ENFORCE(begin_idx < end_idx,
                   "Begin index must be less than end index.");
    PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1.");
-    std::vector<int> d = vectorize(dims_);
+    int base = product(dims_) / dims_[0];
    int base = 1;
    for (size_t i = 1; i < d.size(); ++i) {
      base *= d[i];
    }
    Tensor dst;
    dst.holder_ = holder_;
    DDim dst_dims = dims_;
    dst_dims[0] = end_idx - begin_idx;
-    dst.set_dims(dst_dims);
+    dst.Resize(dst_dims);
    dst.offset_ = offset_ + begin_idx * base * sizeof(T);
    return dst;
  }
-  void set_dims(const DDim& dims) {
+  void Resize(const DDim& dims) { dims_ = dims; }
    if (dims == dims_) {
      return;
    }
    dims_ = dims;
  }
-  DDim dims() const { return dims_; }
+  const DDim& dims() const { return dims_; }
 private:
  // Placeholder hides type T, so it doesn't appear as a template
@ -159,21 +150,9 @@ class Tensor {
  template <typename T, typename PlaceType>
  struct PlaceholderImpl : public Placeholder {
   private:
    template <typename PType>
    class Deleter {
     public:
      Deleter(PType place) : place_(place) {}
      void operator()(T* ptr) { memory::Free(place_, static_cast<void*>(ptr)); }
     private:
      PType place_;
    };
   public:
    PlaceholderImpl(PlaceType place, size_t size)
        : ptr_(static_cast<T*>(memory::Alloc(place, size)),
-               Deleter<PlaceType>(place)),
+               memory::PODDeleter<T, PlaceType>(place)),
          place_(place),
          size_(size) {}
@ -182,13 +161,13 @@ class Tensor {
    virtual paddle::platform::Place place() const { return place_; }
    virtual std::type_index type() const { return std::type_index(typeid(T)); }
-    std::unique_ptr<T, Deleter<PlaceType>> ptr_;
+    std::unique_ptr<T, memory::PODDeleter<T, PlaceType>> ptr_;
    platform::Place place_;  // record the place of ptr_.
    size_t size_;            // size of the memory block.
  };
  template <typename T>
-  inline void CheckDims() const {
+  inline void EnforceSufficientMemory() const {
    PADDLE_ENFORCE(holder_ != nullptr,
                   "Tenosr holds no memory. Call Tensor::mutable_data first.");
    PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
@ -198,7 +177,11 @@ class Tensor {
  std::shared_ptr<Placeholder> holder_;  // holds the memory block if allocated.
  DDim dims_;
-  size_t offset_;  // marks the begin of tensor data area.
+  // A PlaceHolder may be shared by more than one tensor. Some of them may be
  // slices of the others. So the offset_ is introduced here to indicate the
  // byte offset between PlaceHolder::ptr_ and where tensor's data really
  // begins.
  size_t offset_;
 };
 }  // namespace framework
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@ -19,7 +19,7 @@ TEST(Tensor, Dims) {
  using namespace paddle::framework;
  using namespace paddle::platform;
  Tensor tt;
-  tt.set_dims(make_ddim({2, 3, 4}));
+  tt.Resize(make_ddim({2, 3, 4}));
  DDim dims = tt.dims();
  ASSERT_EQ(arity(dims), 3);
  for (int i = 0; i < 3; ++i) {
@ -97,7 +97,7 @@ TEST(Tensor, MutableData) {
 #endif
 }
-TEST(Tensor, ShareDataFrom) {
+TEST(Tensor, ShareDataWith) {
  using namespace paddle::framework;
  using namespace paddle::platform;
  {
@ -106,7 +106,7 @@ TEST(Tensor, ShareDataFrom) {
    // Try to share data from an uninitialized tensor
    bool caught = false;
    try {
-      dst_tensor.ShareDataFrom<float>(src_tensor);
+      dst_tensor.ShareDataWith<float>(src_tensor);
    } catch (std::runtime_error& err) {
      caught = true;
      std::string msg =
@ -119,7 +119,7 @@ TEST(Tensor, ShareDataFrom) {
    ASSERT_TRUE(caught);
    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
-    dst_tensor.ShareDataFrom<int>(src_tensor);
+    dst_tensor.ShareDataWith<int>(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }
@ -128,7 +128,7 @@ TEST(Tensor, ShareDataFrom) {
    Tensor src_tensor;
    Tensor dst_tensor;
    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
-    dst_tensor.ShareDataFrom<int>(src_tensor);
+    dst_tensor.ShareDataWith<int>(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }
 #endif
--- a/paddle/function/ConvOpTest.cpp
+++ b/paddle/function/ConvOpTest.cpp
@ -31,13 +31,22 @@ public:
  ConvolutionTest(const std::string& conv1,
                  const std::string& conv2,
                  TestType type,
                  bool useGroups = true,
                  std::string algo = "auto") {
    for (size_t batchSize : {1, 32}) {
      for (size_t inputSize : {7, 14, 54}) {
        for (size_t filterSize : {1, 3, 5}) {
          for (size_t inputChannels : {3, 64}) {
-            for (size_t outputChannels : {3, 64, 128}) {
+            for (size_t outputChannels : {3, 64}) {
-              if (inputChannels < outputChannels) break;
+              if (inputChannels > outputChannels) break;
              size_t groups;
              if (!useGroups) {
                groups = 1;
              } else {
                if (outputChannels % inputChannels != 0) continue;
                groups = inputChannels;
              }
              for (size_t stride : {1, 2}) {
                for (size_t padding : {0, 1}) {
                  if (padding >= filterSize) break;
@ -62,13 +71,24 @@ public:
                      FuncConfig()
                          .set("paddings", paddings)
                          .set("strides", strides)
-                          .set("groups", (size_t)1)
+                          .set("groups", groups)
                          .set("algo", algo));
                  TensorShape input{
                      batchSize, inputChannels, inputSize, inputSize};
-                  TensorShape filter{
+
-                      outputChannels, inputChannels, filterSize, filterSize};
+                  TensorShape filter;
                  if (groups > 1)
                    filter = TensorShape({groups,
                                          outputChannels / groups,
                                          inputChannels / groups,
                                          filterSize,
                                          filterSize});
                  else
                    filter = TensorShape({outputChannels,
                                          inputChannels,
                                          filterSize,
                                          filterSize});
                  TensorShape output{
                      batchSize, outputChannels, outputSize, outputSize};
@ -85,7 +105,8 @@ public:
                  } else if (type == kBackwardFilterTest) {
                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter),
                                    ADD_TO);
                    test.run();
                  }
                }
@ -106,6 +127,7 @@ public:
  ConvolutionTest2(const std::string& conv1,
                   const std::string& conv2,
                   TestType type,
                   bool useGroups = true,
                   std::string algo = "auto") {
    for (size_t batchSize : {16}) {
      for (size_t inputHeight : {7, 31}) {
@ -113,7 +135,15 @@ public:
          for (size_t filterHeight : {1, 5}) {
            for (size_t filterWidth : {3, 7}) {
              for (size_t inputChannels : {7}) {
-                for (size_t outputChannels : {32}) {
+                for (size_t outputChannels : {7}) {
                  size_t groups;
                  if (!useGroups) {
                    groups = 1;
                  } else {
                    if (outputChannels % inputChannels != 0) continue;
                    groups = inputChannels;
                  }
                  size_t stride = 1;
                  size_t padding = 0;
                  size_t outputHeight =
@ -141,13 +171,24 @@ public:
                      FuncConfig()
                          .set("paddings", paddings)
                          .set("strides", strides)
-                          .set("groups", (size_t)1)
+                          .set("groups", groups)
                          .set("algo", algo));
                  TensorShape input{
                      batchSize, inputChannels, inputHeight, inputWidth};
-                  TensorShape filter{
+
-                      outputChannels, inputChannels, filterHeight, filterWidth};
+                  TensorShape filter;
                  if (groups > 1)
                    filter = TensorShape({groups,
                                          outputChannels / groups,
                                          inputChannels / groups,
                                          filterHeight,
                                          filterWidth});
                  else
                    filter = TensorShape({outputChannels,
                                          inputChannels,
                                          filterHeight,
                                          filterWidth});
                  TensorShape output{
                      batchSize, outputChannels, outputHeight, outputWidth};
@ -164,7 +205,8 @@ public:
                  } else if (type == kBackwardFilterTest) {
                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter),
                                    ADD_TO);
                    test.run();
                  }
                }
@ -177,34 +219,88 @@ public:
  }
 };
 // ======Start Convolution TEST======
 TEST(Forward, GEMM) {
  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
-      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
+      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false);
  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test2(
-      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
+      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false);
 }
 #ifndef PADDLE_ONLY_CPU
 TEST(Forward, GEMM2) {
  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConv-CPU", "GemmConv-GPU", kForwardTest);
+      "GemmConv-CPU", "GemmConv-GPU", kForwardTest, false);
  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConv-CPU", "GemmConv-GPU", kForwardTest);
+      "GemmConv-CPU", "GemmConv-GPU", kForwardTest, false);
 }
 TEST(BackwardInput, GEMM) {
  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
+      "GemmConvGradInput-CPU",
      "GemmConvGradInput-GPU",
      kBackwardInputTest,
      false);
  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
+      "GemmConvGradInput-CPU",
      "GemmConvGradInput-GPU",
      kBackwardInputTest,
      false);
 }
 TEST(BackwardFilter, GEMM) {
  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
+      "GemmConvGradFilter-CPU",
      "GemmConvGradFilter-GPU",
      kBackwardFilterTest,
      false);
  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
+      "GemmConvGradFilter-CPU",
      "GemmConvGradFilter-GPU",
      kBackwardFilterTest,
      false);
 }
 #endif
 // ======End Convolution TEST======
 // ======Start DepthwiseConvolution TEST======
 // TODO(zhaolong) The depthwise convolution cpu test will be added when the cpu
 // version of depthwiseConv is implemented.
 #ifndef PADDLE_ONLY_CPU
 TEST(DepthwiseConvForward, GEMM2) {
  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
      "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);
  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
      "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);
 }
 TEST(DepthwiseConvBackwardInput, GEMM) {
  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
      "GemmConvGradInput-CPU",
      "DepthwiseConvGradInput-GPU",
      kBackwardInputTest);
  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
      "GemmConvGradInput-CPU",
      "DepthwiseConvGradInput-GPU",
      kBackwardInputTest);
 }
 TEST(DepthwiseConvBackwardFilter, GEMM) {
  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
      "GemmConvGradFilter-CPU",
      "DepthwiseConvGradFilter-GPU",
      kBackwardFilterTest);
  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
      "GemmConvGradFilter-CPU",
      "DepthwiseConvGradFilter-GPU",
      kBackwardFilterTest);
 }
 #endif
 // ======End DepthwiseConvolution TEST======
 }  // namespace paddle
--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
--- a/paddle/function/DepthwiseConvOp.h
+++ b/paddle/function/DepthwiseConvOp.h
@ -0,0 +1,159 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include "TensorType.h"
 namespace paddle {
 /**
  * \brief   Depthwise convolution forward. The outputData of the depthwise
  *          convolution is the same as that of ExpandConvLayer when groups
  *          equals inputChannels in ExpandConvLayer.
  *
  *          Only the call operator is declared here; its definition for a
  *          given Device is not part of this header.
  *
  * \param[in]   inputData         input data.
  * \param[in]   filterData        the parameters of the depthwise conv layer.
  * \param[in]   batchSize         batch size of input data.
  * \param[in]   outputChannels    channels of outputData.
  * \param[in]   outputHeight      height of outputData.
  * \param[in]   outputWidth       width of outputData.
  * \param[in]   inputChannels     channels of inputData.
  * \param[in]   inputHeight       height of inputData.
  * \param[in]   inputWidth        width of inputData.
  * \param[in]   filterMultiplier  equals outputChannels/groups_.
  * \param[in]   filterHeight      height of filter.
  * \param[in]   filterWidth       width of filter.
  * \param[in]   strideH           stride size in height direction.
  * \param[in]   strideW           stride size in width direction.
  * \param[in]   paddingH          padding size in height direction.
  * \param[in]   paddingW          padding size in width direction.
  * \param[out]  outputData        outputData.
  *
  */
 template <DeviceType Device, class T>
 class DepthwiseConvFunctor {
 public:
  void operator()(const T* inputData,
                  const T* filterData,
                  int batchSize,
                  int outputChannels,
                  int outputHeight,
                  int outputWidth,
                  int inputChannels,
                  int inputHeight,
                  int inputWidth,
                  int filterMultiplier,
                  int filterHeight,
                  int filterWidth,
                  int strideH,
                  int strideW,
                  int paddingH,
                  int paddingW,
                  T* outputData);
 };
 /**
  * \brief  Functor to compute the depthwise convolution backprop w.r.t. input.
  *
  *         Only the call operator is declared here; its definition for a
  *         given Device is not part of this header.
  *
  * \param[in]   outputGrad        the grad data of output.
  * \param[in]   filterData        the parameters of the depthwise conv layer.
  * \param[in]   batchSize         batch size of input data.
  * \param[in]   outputChannels    channels of outputData.
  * \param[in]   outputHeight      height of outputData.
  * \param[in]   outputWidth       width of outputData.
  * \param[in]   inputChannels     channels of input data.
  * \param[in]   inputHeight       height of inputData.
  * \param[in]   inputWidth        width of inputData.
  * \param[in]   filterMultiplier  equals outputChannels/groups_.
  * \param[in]   filterHeight      height of filter.
  * \param[in]   filterWidth       width of filter.
  * \param[in]   strideH           stride size in height direction.
  * \param[in]   strideW           stride size in width direction.
  * \param[in]   paddingH          padding size in height direction.
  * \param[in]   paddingW          padding size in width direction.
  * \param[out]  inputGrad         the grad data of input.
  *
  */
 template <DeviceType Device, class T>
 class DepthwiseConvGradInputFunctor {
 public:
  void operator()(const T* outputGrad,
                  const T* filterData,
                  int batchSize,
                  int outputChannels,
                  int outputHeight,
                  int outputWidth,
                  int inputChannels,
                  int inputHeight,
                  int inputWidth,
                  int filterMultiplier,
                  int filterHeight,
                  int filterWidth,
                  int strideH,
                  int strideW,
                  int paddingH,
                  int paddingW,
                  T* inputGrad);
 };
 /**
  * \brief  Functor to compute the depthwise convolution backprop w.r.t. filter.
  *
  *         Only the call operator is declared here; its definition for a
  *         given Device is not part of this header.
  *
  * \param[in]      outputGrad        the grad data of output.
  * \param[in]      inputData         inputData.
  * \param[in]      batchSize         batch size of input data.
  * \param[in]      outputChannels    channels of outputData.
  * \param[in]      outputHeight      height of outputData.
  * \param[in]      outputWidth       width of outputData.
  * \param[in]      inputChannels     channels of input data.
  * \param[in]      inputHeight       height of inputData.
  * \param[in]      inputWidth        width of inputData.
  * \param[in]      filterMultiplier  equals outputChannels/groups_.
  * \param[in]      filterHeight      height of filter.
  * \param[in]      filterWidth       width of filter.
  * \param[in]      strideH           stride size in height direction.
  * \param[in]      strideW           stride size in width direction.
  * \param[in]      paddingH          padding size in height direction.
  * \param[in]      paddingW          padding size in width direction.
  * \param[in,out]  colData           auxiliary data when calculating filterGrad.
  *                                   Passed as a non-const pointer; presumably
  *                                   scratch space written by the
  *                                   implementation -- TODO confirm.
  * \param[out]     filterGrad        the grad data of filter.
  *
  */
 template <DeviceType Device, class T>
 class DepthwiseConvGradFilterFunctor {
 public:
  void operator()(const T* outputGrad,
                  const T* inputData,
                  int batchSize,
                  int outputChannels,
                  int outputHeight,
                  int outputWidth,
                  int inputChannels,
                  int inputHeight,
                  int inputWidth,
                  int filterMultiplier,
                  int filterHeight,
                  int filterWidth,
                  int strideH,
                  int strideW,
                  int paddingH,
                  int paddingW,
                  T* colData,
                  T* filterGrad);
 };
 }  // namespace paddle
--- a/paddle/function/DepthwiseConvOpGpu.cu
+++ b/paddle/function/DepthwiseConvOpGpu.cu
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@ -38,10 +38,25 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
  inputShape_.resize(numInputs);
  filterShape_.resize(numInputs);
  outputShape_.resize(numInputs);
  std::string convType;
  std::string convGradInputType;
  std::string convGradFilterType;
  for (int i = 0; i < config_.inputs_size(); i++) {
    std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
    std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
    if (useGpu_ && (size_t)groups_[i] == (size_t)channels_[i] && !isDeconv_) {
      convType = "DepthwiseConv";
      convGradInputType = "DepthwiseConvGradInput";
      convGradFilterType = "DepthwiseConvGradFilter";
    } else {
      convType = "GemmConv";
      convGradInputType = "GemmConvGradInput";
      convGradFilterType = "GemmConvGradFilter";
    }
    if (FLAGS_use_nnpack) {
      CHECK_EQ(isDeconv_, false);
      createFunction(forward_,
@ -53,21 +68,21 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                         .set("algo", std::string("auto")));
    } else {
      createFunction(forward_,
-                     !isDeconv_ ? "GemmConv" : "GemmConvGradInput",
+                     !isDeconv_ ? convType : convGradInputType,
                     FuncConfig()
                         .set("paddings", paddings)
                         .set("strides", strides)
                         .set("groups", (size_t)groups_[i]));
      createFunction(backward_,
-                     !isDeconv_ ? "GemmConvGradInput" : "GemmConv",
+                     !isDeconv_ ? convGradInputType : convType,
                     FuncConfig()
                         .set("paddings", paddings)
                         .set("strides", strides)
                         .set("groups", (size_t)groups_[i]));
      createFunction(backward_,
-                     "GemmConvGradFilter",
+                     convGradFilterType,
                     FuncConfig()
                         .set("paddings", paddings)
                         .set("strides", strides)
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@ -347,6 +347,55 @@ TEST(Layer, CosSimVecMatLayer) {
  }
 }
 void testDepthwiseConvLayer(const string& type, bool useGpu) {
  TestConfig config;
  config.biasSize = 32;
  config.layerConfig.set_type(type);
  config.layerConfig.set_num_filters(32);
  config.layerConfig.set_partial_sum(1);
  config.layerConfig.set_shared_biases(true);
  config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192});
  LayerInputConfig* input = config.layerConfig.add_inputs();
  ConvConfig* conv = input->mutable_conv_conf();
  conv->set_filter_size(2);
  conv->set_filter_size_y(3);
  conv->set_channels(16);
  conv->set_padding(0);
  conv->set_padding_y(1);
  conv->set_stride(2);
  conv->set_stride_y(2);
  conv->set_groups(16);
  conv->set_filter_channels(conv->channels() / conv->groups());
  conv->set_img_size(16);
  conv->set_img_size_y(8);
  conv->set_output_x(outputSize(conv->img_size(),
                                conv->filter_size(),
                                conv->padding(),
                                conv->stride(),
                                /* caffeMode */ true));
  conv->set_output_y(outputSize(conv->img_size_y(),
                                conv->filter_size_y(),
                                conv->padding_y(),
                                conv->stride_y(),
                                /* caffeMode */ true));
  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
                              config.layerConfig.num_filters());
  testLayerGrad(config, "depthwise_conv", 100, false, useGpu);
  // Use small batch_size and useWeight=true to test biasGrad
  testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02);
 }
 TEST(Layer, depthwiseConvLayer) {
  //  'depthwise_conv' is a special case of 'exconv' whose
  //  groups size equals the input channels size.
  //  The CPU run is always executed; the GPU run is compiled in only when
  //  PADDLE_ONLY_CPU is not defined.
  testDepthwiseConvLayer("exconv", /* useGpu= */ false);
 #ifndef PADDLE_ONLY_CPU
  testDepthwiseConvLayer("exconv", /* useGpu= */ true);
 #endif
 }
 void testConvLayer(const string& type, bool trans, bool useGpu) {
  TestConfig config;
  config.biasSize = 16;
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@ -202,7 +202,7 @@ double dotProduct<double>(const int n, const double* x, const double* y) {
  return cblas_ddot(n, x, 1, y, 1);
 }
-#ifdef PADDLE_USE_MKL
+#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)
 template <>
 void vExp<float>(const int n, const float* a, float* r) {
@ -243,7 +243,55 @@ template <>
 void vAdd<double>(const int n, const double* a, const double* b, double* r) {
  vdAdd(n, a, b, r);
 }
 #else
 // Generic vExp: the binary::vExp functor carries the expression
 // b = std::exp(a) and is handed to hl_cpu_apply_binary_op with input a and
 // output r.
 // NOTE(review): the trailing arguments (1, n, n, n) presumably describe a
 // 1 x n matrix so the expression is applied to n elements -- confirm against
 // hl_cpu_apply_binary_op.
 DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
 template <class T>
 void vExp(const int n, const T* a, T* r) {
  hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
      binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }
 // Generic vLog: the binary::vLog functor carries the expression
 // b = std::log(a) and is handed to hl_cpu_apply_binary_op with input a and
 // output r.
 // NOTE(review): the trailing arguments (1, n, n, n) presumably describe a
 // 1 x n matrix so the expression is applied to n elements -- confirm against
 // hl_cpu_apply_binary_op.
 DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
 template <class T>
 void vLog(const int n, const T* a, T* r) {
  hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
      binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }
 // Generic vPow: the binary::vPow functor is constructed with the exponent b
 // (the macro's single parameter p in b = std::pow(a, p)) and is handed to
 // hl_cpu_apply_binary_op with input a and output r.
 // NOTE(review): the trailing arguments (1, n, n, n) presumably describe a
 // 1 x n matrix so the expression is applied to n elements -- confirm against
 // hl_cpu_apply_binary_op.
 DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
 template <class T>
 void vPow(const int n, const T* a, const T b, T* r) {
  hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
      binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
 }
 // Generic vAdd: the ternary::vAdd functor carries the expression c = a + b
 // and is handed to hl_cpu_apply_ternary_op with inputs a and b and output r.
 // NOTE(review): the trailing arguments (1, n, n, n, n) presumably describe
 // 1 x n matrices so the expression is applied to n elements -- confirm
 // against hl_cpu_apply_ternary_op.
 DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
 template <class T>
 void vAdd(const int n, const T* a, const T* b, T* r) {
  hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
                                                     const_cast<T*>(a),
                                                     const_cast<T*>(b),
                                                     r,
                                                     1,
                                                     n,
                                                     n,
                                                     n,
                                                     n);
 }
 template void vExp(const int n, const float* a, float* r);
 template void vExp(const int n, const double* a, double* r);
 template void vLog(const int n, const float* a, float* r);
 template void vLog(const int n, const double* a, double* r);
 template void vPow(const int n, const float* a, const float b, float* r);
 template void vPow(const int n, const double* a, const double b, double* r);
 template void vAdd(const int n, const float* a, const float* b, float* r);
 template void vAdd(const int n, const double* a, const double* b, double* r);
 #endif
 #ifdef PADDLE_USE_MKL
 template <>
 void vInvSqrt<float>(const int n, const float* a, float* r) {
  vsInvSqrt(n, a, r);
@ -275,20 +323,6 @@ void vTanh<double>(const int n, const double* a, double* r) {
 }
 #else
 // Generic vExp: the binary::vExp functor carries the expression
 // b = std::exp(a) and is handed to hl_cpu_apply_binary_op with input a and
 // output r.
 // NOTE(review): the trailing arguments (1, n, n, n) presumably describe a
 // 1 x n matrix so the expression is applied to n elements -- confirm against
 // hl_cpu_apply_binary_op.
 DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
 template <class T>
 void vExp(const int n, const T* a, T* r) {
  hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
      binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }
 // Generic vLog: the binary::vLog functor carries the expression
 // b = std::log(a) and is handed to hl_cpu_apply_binary_op with input a and
 // output r.
 // NOTE(review): the trailing arguments (1, n, n, n) presumably describe a
 // 1 x n matrix so the expression is applied to n elements -- confirm against
 // hl_cpu_apply_binary_op.
 DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
 template <class T>
 void vLog(const int n, const T* a, T* r) {
  hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
      binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }
 DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
 template <class T>
 void vInvSqrt(const int n, const T* a, T* r) {
@ -312,41 +346,12 @@ void vTanh(const int n, const T* a, T* r) {
      binary::vTanh<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }
 // Generic vPow: the binary::vPow functor is constructed with the exponent b
 // (the macro's single parameter p in b = std::pow(a, p)) and is handed to
 // hl_cpu_apply_binary_op with input a and output r.
 // NOTE(review): the trailing arguments (1, n, n, n) presumably describe a
 // 1 x n matrix so the expression is applied to n elements -- confirm against
 // hl_cpu_apply_binary_op.
 DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
 template <class T>
 void vPow(const int n, const T* a, const T b, T* r) {
  hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
      binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
 }
 // Generic vAdd: the ternary::vAdd functor carries the expression c = a + b
 // and is handed to hl_cpu_apply_ternary_op with inputs a and b and output r.
 // NOTE(review): the trailing arguments (1, n, n, n, n) presumably describe
 // 1 x n matrices so the expression is applied to n elements -- confirm
 // against hl_cpu_apply_ternary_op.
 DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
 template <class T>
 void vAdd(const int n, const T* a, const T* b, T* r) {
  hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
                                                     const_cast<T*>(a),
                                                     const_cast<T*>(b),
                                                     r,
                                                     1,
                                                     n,
                                                     n,
                                                     n,
                                                     n);
 }
 template void vExp(const int n, const float* a, float* r);
 template void vExp(const int n, const double* a, double* r);
 template void vLog(const int n, const float* a, float* r);
 template void vLog(const int n, const double* a, double* r);
 template void vInvSqrt(const int n, const double* a, double* r);
 template void vInvSqrt(const int n, const float* a, float* r);
 template void vLog1p(const int n, const float* a, float* r);
 template void vLog1p(const int n, const double* a, double* r);
 template void vTanh(const int n, const float* a, float* r);
 template void vTanh(const int n, const double* a, double* r);
 template void vPow(const int n, const float* a, const float b, float* r);
 template void vPow(const int n, const double* a, const double b, double* r);
 template void vAdd(const int n, const float* a, const float* b, float* r);
 template void vAdd(const int n, const double* a, const double* b, double* r);
 #endif
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@ -15,6 +15,12 @@ limitations under the License. */
 #ifndef MATHFUNCTIONS_H_
 #define MATHFUNCTIONS_H_
 #ifdef PADDLE_USE_MKLML
 #include <mkl_cblas.h>
 #include <mkl_lapacke.h>
 #include <mkl_vml_functions.h>
 #endif
 #ifdef PADDLE_USE_MKL
 #include <mkl.h>
 #include <mkl_lapacke.h>
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@ -1,11 +1,16 @@
 add_subdirectory(detail)
 cc_library(memory SRCS memory.cc)
 cc_library(memcpy SRCS memcpy.cc DEPS device_context)
 cc_library(paddle_memory
    DEPS
-    memory meta_data
+    memory
-    meta_cache memory_block
+    memcpy
-    buddy_allocator system_allocator)
+    meta_data
    meta_cache
    memory_block
    buddy_allocator
    system_allocator)
 cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
--- a/paddle/memory/README.md
+++ b/paddle/memory/README.md
@ -1,140 +1,4 @@
-## Design
+# Region-based Heterogeneous Memory Management
-### Usage
+Please check out the [design documentation](http://gangliao.me) to find out more details about
-
+buddy memory allocator for both CPU and GPU.
 To allocate 4KB CPU memory:
 ```cpp
 p = memory::Alloc(platform::CPUPlace(), 4*1024);
 ```
 To allocate 4KB memory on the 3rd GPU:
 ```cpp
 p = memory::Alloc(platform::GPUPlace(2), 4*1024);
 ```
 To free memory and check the so-far used amount of memory on a place:
 ```cpp
 auto pl = platform::GPUPlace(0);
 p = memory::Alloc(pl, 4*1024);
 cout << memory::Used(pl);
 memory::Free(pl, p);
 ```
 ### API
 In `paddle/memory/memory.h` we have:
 ```cpp
 namespace memory {
 template <typename Place> void* Alloc(Place, size_t);
 template <typename Place> void Free(Place, void*);
 template <typename Place> size_t Used(Place);
 }  // namespace memory
 ```
 These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
 ```cpp
 template<>
 void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
  return GetCPUBuddyAllocator()->Alloc(size);
 }
 ```
 and 
 ```cpp
 template<>
 void Alloc<GPUPlace>(GPUPlace p, size_t size) {
  return GetGPUBuddyAllocator(p.id)->Alloc(size);
 }
 ```
 Similar specializations exist for `Free` and `Used`.
 ### Implementation
 `GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletons.
 ```cpp
 BuddyAllocator* GetCPUBuddyAllocator() {
  static BuddyAllocator* a = NULL;
  if (a == NULL) {
    a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
  }
  return a;
 }
 BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
  static BuddyAllocator* as = NULL;
  if (as == NULL) {
    as = new BuddyAllocator*[platform::NumGPUs()];
    for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
      as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...);
    }
  }
  return as[gpu_id];
 }
 ```
 #### `BuddyAllocator`
 `BuddyAllocator` implements the buddy allocation algorithm.  Its constructor takes parameters only related with the algorithm:
 ```cpp
 BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) {
  ...
 }
 ```
 Please be aware that **`BuddyAllocator` always allocates aligned memory**, aligned on 32 bytes, which can hold a `BuddyAllocator::Block` object:
 ```cpp
 class BuddyAllocator {
 private:
  struct Block {
    size_t size;
    Block* left, right;
    size_t index; // allocator id
  };
  ...
 };
 ```
 Because `BuddyAllocator` has the metadata of each block, it can track the used memory -- it records the amount returned by `Alloc` and the amount freed in `Free`.  In contrast, `CPUAllocator` and `GPUAllocator` don't know the size of a freed memory block and cannot do this tracking.
 #### System Allocators
 The `GPUAllocator` and `CPUAllocator` are called *system allocators*.  They work as the fallback allocators of `BuddyAllocator`.
 ## Justification
 I got inspiration from Majel and Caffe2, though the above design looks different from both.
 ### Caffe2
 In Caffe2, `Tensor<Context>::mutable_data()` allocates the memory.  In particular, [`Tensor<Context>::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor<Context>::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479).
 There are two implementations of `Context`:
 1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
 1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202).  This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member.   `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
 ### Majel
 In Majel, there are basically two allocator types:
 1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`.
 1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`.
 However, memory allocation is not via these two allocators.  Instead, these two allocators are defined in hidden namespaces.
 In Majel there are hidden global variables like:
 1. `cpu::SystemAllocator g_cpu_allocator`, and
 1. `vector<gpu::SystemAllocator*> g_gpu_allocators(NUM_GPUS)`.
 Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`.
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
@ -0,0 +1,70 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/memory/memcpy.h"
 #include <cstring>  // for memcpy
 #include "paddle/platform/device_context.h"
 namespace paddle {
 namespace memory {
 // Host-to-host copy: both places are CPUPlace (and unused), so the copy is a
 // plain std::memcpy of num bytes from src to dst.
 template <>
 void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
                                                  platform::CPUPlace,
                                                  const void* src, size_t num) {
  std::memcpy(dst, src, num);
 }
 #ifndef PADDLE_ONLY_CPU
 // Device-to-host copy: the source is a GPUPlace, the destination a CPUPlace.
 // A GPUPlaceGuard is constructed with src_place.device before the copy is
 // issued (presumably making that device current for the guard's lifetime --
 // confirm against GPUPlaceGuard). The copy itself is delegated to
 // platform::GpuMemcpyAsync with cudaMemcpyDeviceToHost on the given stream.
 // dst_place is not used.
 template <>
 void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
                                                  void* dst,
                                                  platform::GPUPlace src_place,
                                                  const void* src, size_t num,
                                                  cudaStream_t stream) {
  platform::GPUPlaceGuard g(src_place.device);
  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
 }
 // Host-to-device copy: the source is a CPUPlace, the destination a GPUPlace.
 // A GPUPlaceGuard is constructed with dst_place.device before the copy is
 // issued (presumably making that device current for the guard's lifetime --
 // confirm against GPUPlaceGuard). The copy itself is delegated to
 // platform::GpuMemcpyAsync with cudaMemcpyHostToDevice on the given stream.
 // src_place is not used.
 template <>
 void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
                                                  void* dst,
                                                  platform::CPUPlace src_place,
                                                  const void* src, size_t num,
                                                  cudaStream_t stream) {
  platform::GPUPlaceGuard g(dst_place.device);
  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
 }
 // Device-to-device copy. When the two places differ, the copy is delegated
 // to platform::GpuMemcpyPeer with both device ids; when they compare equal,
 // a GPUPlaceGuard is constructed with src_place.device and the copy is
 // delegated to platform::GpuMemcpyAsync with cudaMemcpyDeviceToDevice.
 template <>
 void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
                                                  void* dst,
                                                  platform::GPUPlace src_place,
                                                  const void* src, size_t num,
                                                  cudaStream_t stream) {
  // Different places: hand both device ids to the peer copy and finish.
  if (!(dst_place == src_place)) {
    platform::GpuMemcpyPeer(
        dst, dst_place.device, src, src_place.device, num, stream);
    return;
  }

  // Same place: guard on that device, then issue the device-to-device copy.
  platform::GPUPlaceGuard guard(src_place.device);
  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
 }
 #endif  // PADDLE_ONLY_CPU
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/memory/memcpy.h
+++ b/paddle/memory/memcpy.h
@ -0,0 +1,33 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include "paddle/platform/gpu_info.h"
 #include "paddle/platform/place.h"
 namespace paddle {
 namespace memory {
 /**
  * \brief Copy num bytes from src to dst, where the two place arguments
  *        identify where each buffer lives.
  *
  * Only the declaration lives here; explicit specializations are provided in
  * memcpy.cc (the CPUPlace-to-CPUPlace one forwards to std::memcpy).
  *
  * \param[in]   DstPlace  place of the destination buffer (first argument).
  * \param[out]  dst       destination buffer.
  * \param[in]   SrcPlace  place of the source buffer (third argument).
  * \param[in]   src       source buffer.
  * \param[in]   num       number of bytes to copy.
  */
 template <typename DstPlace, typename SrcPlace>
 void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
 #ifndef PADDLE_ONLY_CPU
 /**
  * \brief Stream-taking overload of Copy, declared only when PADDLE_ONLY_CPU
  *        is not defined.
  *
  * Only the declaration lives here; the explicit specializations in memcpy.cc
  * pass the stream on to platform::GpuMemcpyAsync or platform::GpuMemcpyPeer.
  * NOTE(review): presumably the copy is asynchronous with respect to the host
  * and ordered on the given stream -- confirm against those helpers.
  *
  * \param[in]   DstPlace  place of the destination buffer (first argument).
  * \param[out]  dst       destination buffer.
  * \param[in]   SrcPlace  place of the source buffer (third argument).
  * \param[in]   src       source buffer.
  * \param[in]   num       number of bytes to copy.
  * \param[in]   stream    CUDA stream handed to the underlying copy call.
  */
 template <typename DstPlace, typename SrcPlace>
 void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
          cudaStream_t stream);
 #endif  // PADDLE_ONLY_CPU
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@ -15,7 +15,8 @@ limitations under the License. */
 #include "paddle/memory/memory.h"
 #include "paddle/memory/detail/buddy_allocator.h"
 #include "paddle/memory/detail/system_allocator.h"
-#include "paddle/platform/assert.h"
+
 #include <cstring>  // for memcpy
 namespace paddle {
 namespace memory {
--- a/Show More
+++ b/Show More