Update code and small fix.

release/0.11.0
dangqingqing 7 years ago
commit a1bf4c2833

@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
...

@ -42,7 +42,7 @@ before_install:
script:
- |
timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
- |
if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;

@ -133,6 +133,8 @@ include(external/any) # download linb::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/nccl)
include(external/cares)
include(external/grpc)
include(cudnn) # set cudnn libraries, must before configure
include(configure) # add paddle env configuration

@ -29,7 +29,7 @@ RUN apt-get update && \
automake locales clang-format swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools && \
net-tools libtool && \
apt-get clean -y
# Install Go and glide

@ -0,0 +1,45 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
# NOTE: c-ares is needed when linking with grpc.
SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares)
SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares)
SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE)
ExternalProject_Add(
extern_cares
GIT_REPOSITORY "https://github.com/c-ares/c-ares.git"
GIT_TAG "cares-1_13_0"
PREFIX ${CARES_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
BUILD_IN_SOURCE 1
BUILD_COMMAND make
INSTALL_COMMAND make install
)
ADD_LIBRARY(cares STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION
"${CARES_INSTALL_DIR}/lib/libcares.a")
include_directories(${CARES_INCLUDE_DIR})
ADD_DEPENDENCIES(cares extern_cares)

@ -0,0 +1,66 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
IF(APPLE)
SET(BUILD_CMD make -n | sed "s/-Werror//g" | sh)
ELSE()
SET(BUILD_CMD make)
ENDIF()
ExternalProject_Add(
extern_grpc
DEPENDS protobuf zlib
GIT_REPOSITORY "https://github.com/grpc/grpc.git"
GIT_TAG "v1.7.x"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
# NOTE(yuyang18):
# Disable -Werror, otherwise the build fails on macOS.
# It seems we cannot configure this through the make command alone,
# so we dry-run make, strip `-Werror` from its output, and pipe the
# resulting commands to a shell.
BUILD_COMMAND ${BUILD_CMD}
INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
)
# FIXME(typhoonzero): hack to get the static lib paths; try a better way, e.g., merging them.
ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc++.a")
ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgpr.a")
ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a")
include_directories(${GRPC_INCLUDE_DIR})
ADD_DEPENDENCIES(grpc++_unsecure extern_grpc)

@ -15,7 +15,18 @@
INCLUDE(ExternalProject)
# Always invoke `FIND_PACKAGE(Protobuf)` to import the function protobuf_generate_cpp
FIND_PACKAGE(Protobuf QUIET)
SET(PROTOBUF_FOUND "OFF")
macro(UNSET_VAR VAR_NAME)
UNSET(${VAR_NAME} CACHE)
UNSET(${VAR_NAME})
endmacro()
UNSET_VAR(PROTOBUF_INCLUDE_DIR)
UNSET_VAR(PROTOBUF_FOUND)
UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
UNSET_VAR(PROTOBUF_PROTOC_LIBRARY)
UNSET_VAR(PROTOBUF_LITE_LIBRARY)
UNSET_VAR(PROTOBUF_LIBRARY)
UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_generate_python is not defined.
function(protobuf_generate_python SRCS)
@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB)
# FindProtobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`;
# set it to make `protobuf_generate_cpp` happy.
SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
FOREACH(dep ${protobuf_DEPS})
ADD_DEPENDENCIES(protobuf ${dep})
ADD_DEPENDENCIES(protobuf_lite ${dep})
@ -128,11 +138,11 @@ endmacro()
set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include)
find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib)
find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib)
find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib)
find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin)
find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
SET_PROTOBUF_VERSION()

@ -50,6 +50,8 @@ ExternalProject_Add(
)
LIST(APPEND external_project_dependencies zlib)
ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
IF(WITH_C_API)
INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)

@ -459,11 +459,58 @@ function(py_test TARGET_NAME)
if(WITH_TESTING)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
${PYTHON_EXECUTABLE} ${py_test_SRCS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endfunction()
# grpc_library generates gRPC code with grpc_cpp_plugin and protoc, then
# builds the generated protobuf and gRPC code together with your
# implementation sources. Use the SRCS argument for your implementation
# source files and the PROTO argument for your .proto file.
#
# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep)
function(grpc_library TARGET_NAME)
set(oneValueArgs PROTO)
set(multiValueArgs SRCS DEPS)
set(options "")
cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
message(STATUS "generating grpc ${grpc_library_PROTO}")
get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE)
get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")
add_custom_command(
OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}"
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
--plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
# FIXME(typhoonzero): gRPC-generated code does not declare virtual dtors; demote
# the resulting diagnostics from errors to warnings. Should also try to remove the warnings.
set_source_files_properties(
${grpc_grpc_srcs}
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
set_source_files_properties(
${grpc_library_SRCS}
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
endfunction()

@ -54,7 +54,7 @@ img_conv
.. _api_v2.layer_context_projection:
context_projection
context_projection
------------------
.. autoclass:: paddle.v2.layer.context_projection
:noindex:
@ -70,7 +70,7 @@ Image Pooling Layer
img_pool
--------
.. autoclass:: paddle.v2.layer.img_pool
:noindex:
:noindex:
spp
---
@ -104,7 +104,7 @@ sum_to_one_norm
---------------
.. autoclass:: paddle.v2.layer.sum_to_one_norm
:noindex:
cross_channel_norm
------------------
.. autoclass:: paddle.v2.layer.cross_channel_norm
@ -114,7 +114,7 @@ row_l2_norm
-----------
.. autoclass:: paddle.v2.layer.row_l2_norm
:noindex:
Recurrent Layers
================
@ -415,6 +415,13 @@ multiplex
.. autoclass:: paddle.v2.layer.multiplex
:noindex:
Factorization Machine Layer
============================
factorization_machine
---------------------
.. autoclass:: paddle.v2.layer.factorization_machine
:noindex:
Slicing and Joining Layers
==========================

File diff suppressed because it is too large

@ -55,7 +55,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
}
PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
paddle_real* value) {
paddle_real* value) {
if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
auto ptr = cast(mat);
if (ptr->mat == nullptr) return kPD_NULLPTR;
@ -75,7 +75,7 @@ PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
}
PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
paddle_real* result) {
paddle_real* result) {
if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
auto ptr = cast(mat);
if (ptr->mat == nullptr) return kPD_NULLPTR;

@ -79,7 +79,7 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
* @note value should contain enough elements of data to initialize the mat
*/
PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
paddle_real* value);
paddle_real* value);
/**
* @brief PDMatGetRow Get raw row buffer from matrix
@ -93,14 +93,14 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
paddle_real** rawRowBuffer);
/**
* @brief copy data from the matrix
* @brief copy data from the matrix
* @param [in] mat Target matrix
* @param [out] result pointer to store the matrix data
* @param [out] result pointer to store the matrix data
* @return paddle_error
* @note the space of the result should be allocated before invoking this API
*/
PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
paddle_real* result);
paddle_real* result);
/**
* @brief PDMatCreateNone Create None Matrix
* @return

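A minimal caller-side sketch of the get/set pair above, assuming the matrix create/destroy entry points and the kPD_NO_ERROR status come from the same C API (those names are not shown in this diff):

#include <cstdio>
#include <vector>

#include "paddle/capi/matrix.h"  // assumed header location for this C API

// Round-trip a 2x3 CPU matrix through set_value/get_value.
void MatrixRoundTrip() {
  paddle_matrix mat = paddle_matrix_create(2, 3, /* useGpu= */ false);
  std::vector<paddle_real> in(6, 1.0f);
  std::vector<paddle_real> out(6, 0.0f);  // per the @note, caller-allocated
  if (paddle_matrix_set_value(mat, in.data()) == kPD_NO_ERROR &&
      paddle_matrix_get_value(mat, out.data()) == kPD_NO_ERROR) {
    std::printf("out[0] = %f\n", static_cast<double>(out[0]));
  }
  paddle_matrix_destroy(mat);
}
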
@ -13,6 +13,8 @@
limitations under the License. */
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/data_type.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/memory/memcpy.h"
#include "paddle/memory/memory.h"
@ -27,11 +29,11 @@
namespace paddle {
namespace framework {
std::ostream& operator<<(std::ostream& os, const LoD& lod) {
std::ostream &operator<<(std::ostream &os, const LoD &lod) {
os << "{";
for (auto& v : lod) {
for (auto &v : lod) {
os << "{";
for (auto& i : v) {
for (auto &i : v) {
os << i << ",";
}
os << "}";
@ -41,7 +43,7 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) {
return os;
}
LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) {
LoD new_lod;
new_lod.reserve(level_end - level_begin);
for (size_t i = level_begin; i < level_end; i++) {
@ -53,7 +55,7 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
return new_lod;
}
LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
size_t elem_end) {
PADDLE_ENFORCE_LT(level, in.size());
PADDLE_ENFORCE_LT(elem_end, in[level].size());
@ -64,9 +66,9 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
res[0].assign(in[level].begin() + elem_begin,
in[level].begin() + elem_end + 1);
for (size_t lvl = 1; lvl < res.size(); lvl++) {
const auto& in_level = in[level + lvl];
const auto& above_level = res[lvl - 1];
auto& out_level = res[lvl];
const auto &in_level = in[level + lvl];
const auto &above_level = res[lvl - 1];
auto &out_level = res[lvl];
out_level.assign(in_level.begin() + above_level.front(),
in_level.begin() + above_level.back() + 1);
}
@ -74,33 +76,33 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
// to make the first offset equals 0, all the elements minus the first
// element
size_t front = res[lvl].front();
for (auto& ele : res[lvl]) {
for (auto &ele : res[lvl]) {
ele -= front;
}
}
return res;
}
LoD ToAbsOffset(const LoD& in) {
LoD ToAbsOffset(const LoD &in) {
// the lowest level stores relative offsets
if (in.empty() || in.size() == 1) return in;
LoD result = in;
for (int level = result.size() - 2; level >= 0; level--) {
for (auto& ele : result[level]) {
for (auto &ele : result[level]) {
ele = result[level + 1][ele];
}
}
return result;
}
bool operator==(const LoD& a, const LoD& b) {
bool operator==(const LoD &a, const LoD &b) {
if (a.size() != b.size()) {
return false;
}
for (size_t i = 0; i < a.size(); i++) {
const auto& a_level = a[i];
const auto& b_level = b[i];
const auto &a_level = a[i];
const auto &b_level = b[i];
if (a_level.size() != b_level.size()) {
return false;
}
@ -151,7 +153,7 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
}
using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
size_t end_idx, size_t start_level) {
LoD sub_lod;
@ -170,7 +172,7 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
return LoDAndOffset{sub_lod, {start_idx, end_idx}};
}
void AppendLoD(LoD* lod, const LoD& lod_length) {
void AppendLoD(LoD *lod, const LoD &lod_length) {
PADDLE_ENFORCE(
lod->empty() || lod->size() == lod_length.size(),
"The lod_length should has the same size with the appended lod.");
@ -178,12 +180,139 @@ void AppendLoD(LoD* lod, const LoD& lod_length) {
*lod = LoD(lod_length.size(), std::vector<size_t>({0}));
}
for (size_t i = 0; i < lod->size(); ++i) {
auto& level = (*lod)[i];
auto &level = (*lod)[i];
for (size_t len : lod_length[i]) {
level.push_back(level.back() + len);
}
}
}
void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
const platform::DeviceContext &dev_ctx) {
// TODO(typhoonzero): serialize to ostream
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char *>(&version), sizeof(version));
}
{ // the 2nd field, tensor description
// int32_t size
// void* protobuf message
framework::TensorDesc desc;
desc.set_data_type(framework::ToDataType(tensor.type()));
auto dims = framework::vectorize(tensor.dims());
auto *pb_dims = desc.mutable_dims();
pb_dims->Resize(static_cast<int>(dims.size()), 0);
std::copy(dims.begin(), dims.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
os.write(reinterpret_cast<const char *>(&size), sizeof(size));
auto out = desc.SerializeAsString();
os.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.memory_size();
auto *data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto &gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext &>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::GPUPlace>(tensor.place()),
reinterpret_cast<const void *>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
os.write(static_cast<const char *>(data_ptr),
static_cast<std::streamsize>(size));
}
}
{ // the 4th field, lod information
// uint64_t lod_level
// uint64_t lod_level_1 size in bytes.
// int* lod_level_1 data
// ...
auto lod = tensor.lod();
uint64_t size = lod.size();
os.write(reinterpret_cast<const char *>(&size), sizeof(size));
for (auto &each : lod) {
size = each.size() * sizeof(framework::LoD::value_type::value_type);
os.write(reinterpret_cast<const char *>(&size), sizeof(size));
os.write(reinterpret_cast<const char *>(each.data()),
static_cast<std::streamsize>(size));
}
}
}
void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
uint32_t version;
is.read(reinterpret_cast<char *>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
framework::TensorDesc desc;
{ // int32_t size
// proto buffer
int32_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char *>(buf.get()), size);
PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
"Cannot parse tensor desc");
}
{ // read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void *buf;
platform::Place cpu = platform::CPUPlace();
switch (desc.data_type()) {
case framework::FP32:
buf = tensor->mutable_data<float>(cpu);
break;
case framework::FP64:
buf = tensor->mutable_data<double>(cpu);
break;
case framework::INT32:
buf = tensor->mutable_data<int>(cpu);
break;
case framework::INT64:
buf = tensor->mutable_data<int64_t>(cpu);
break;
default:
PADDLE_THROW("DataType %d not supported", desc.data_type());
}
is.read(static_cast<char *>(buf), tensor->memory_size());
}
{ // read lod
uint64_t lod_level;
is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
auto &lod = *tensor->mutable_lod();
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::vector<size_t> tmp(size / sizeof(size_t));
is.read(reinterpret_cast<char *>(tmp.data()),
static_cast<std::streamsize>(size));
lod[i] = tmp;
}
}
}
} // namespace framework
} // namespace paddle

@ -189,5 +189,14 @@ std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
void AppendLoD(LoD* lod, const LoD& lod_length);
/*
* Serialize/Deserialize a LoDTensor to/from std::ostream/std::istream.
* You can pass an ofstream or ostringstream to serialize to a file
* or to an in-memory string. A GPU tensor will be copied to CPU first.
*/
void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
const platform::DeviceContext& dev_ctx);
void DeserializeFromStream(std::istream& is, LoDTensor* tensor);
} // namespace framework
} // namespace paddle

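For reference, a minimal round-trip sketch of the stream API declared above, assuming a CPU-only build and the usual framework helpers (make_ddim, mutable_data, set_lod) from the same codebase:

#include <sstream>

#include "paddle/framework/lod_tensor.h"
#include "paddle/platform/device_context.h"

void LoDTensorRoundTrip() {
  using namespace paddle::framework;
  LoDTensor src;
  src.Resize(make_ddim({2, 3}));
  float* p = src.mutable_data<float>(paddle::platform::CPUPlace());
  for (int i = 0; i < 6; ++i) p[i] = static_cast<float>(i);
  src.set_lod({{0, 1, 2}});  // one LoD level with relative offsets

  paddle::platform::CPUDeviceContext ctx;
  std::stringstream ss;
  // Writes the four fields above: version, TensorDesc (size + protobuf
  // message), raw tensor data, then the LoD levels.
  SerializeToStream(ss, src, ctx);

  LoDTensor dst;
  DeserializeFromStream(ss, &dst);  // dst now matches src on CPU
}
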
@ -135,18 +135,17 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
auto dst_ptr = static_cast<void*>(dst->data());
if (platform::is_cpu_place(src.place())) {
memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()),
src_ptr, size);
memory::Copy(dst_place, dst_ptr,
boost::get<platform::CPUPlace>(src.place()), src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()), src_ptr,
size,
dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()),
src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif
}
} // namespace framework

@ -0,0 +1,158 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "FactorizationMachineLayer.h"
#include <algorithm>
#include <vector>
#include "paddle/math/SparseMatrix.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
bool FactorizationMachineLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
Layer::init(layerMap, parameterMap);
factorSize_ = config_.factor_size();
/* initialize the latentVectors_ */
CHECK_EQ(inputLayers_.size(), 1UL);
size_t inputSize = inputLayers_[0]->getSize();
CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
latentVectors_ = std::unique_ptr<Weight>(
new Weight(inputSize, factorSize_, parameters_[0]));
return true;
}
void FactorizationMachineLayer::forward(PassType passType) {
Layer::forward(passType);
const MatrixPtr& inputV = getInputValue(0);
size_t batchSize = inputV->getHeight();
size_t outputSize = getSize();
size_t inputSize = inputLayers_[0]->getSize();
reserveOutput(batchSize, outputSize);
MatrixPtr outV = getOutputValue();
Matrix::resizeOrCreate(
latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
Matrix::resizeOrCreate(
inputMulFactor_, batchSize, factorSize_, false, useGpu_);
Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str());
inputMulFactor_->mul(*inputV, *latentVectors_->getW());
inputMulFactor_->square2(*tmpOut_);
outV->sumRows(*tmpOut_, 0.5, 0);
if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
Matrix::resizeOrCreateSparseMatrix(inputSquare_,
inputV->getHeight(),
inputV->getWidth(),
inputV->getElementCnt(),
inputV->getValueType());
inputSquare_->copyFrom(*inputV);
(dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
} else {
Matrix::resizeOrCreate(
inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
inputV->square2(*inputSquare_);
}
latentVectors_->getW()->square2(*latentVectorsSquare_);
tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
outV->sumRows(*tmpOut_, -0.5, 1.0);
/* activation */ {
REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str());
forwardActivation();
}
}
void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
/* Do derivation */ { backwardActivation(); }
const MatrixPtr& inputV = getInputValue(0);
const MatrixPtr& oGrad = getOutputGrad();
Matrix::resizeOrCreate(
tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
latentVectors_->getW()->getHeight(),
1,
false,
useGpu_);
/* Calculate the gradients of the latentVectors_ matrix */
if (latentVectors_->getWGrad()) {
if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
Matrix::resizeOrCreateSparseMatrix(tmpInput_,
inputV->getHeight(),
inputV->getWidth(),
inputV->getElementCnt());
CpuSparseMatrix* sparseInputV =
dynamic_cast<CpuSparseMatrix*>(inputV.get());
CpuSparseMatrix* sparseInputSquare =
dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
CpuSparseMatrix* sparseTmpInput =
dynamic_cast<CpuSparseMatrix*>(tmpInput_.get());
sparseTmpInput->copyFrom(*sparseInputV);
sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
latentVectors_->getWGrad()->mul(
*sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
negOnes_->zeroMem();
negOnes_->add(-1);
tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
} else {
Matrix::resizeOrCreate(
tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
tmpInput_->rowScale(0, *inputV, *oGrad);
latentVectors_->getWGrad()->mul(
*tmpInput_->getTranspose(), *inputMulFactor_, 1, 1);
tmpInput_->rowScale(0, *inputSquare_, *oGrad);
tmpSum_->sumCols(*tmpInput_, -1, 0);
}
latentVectors_->getWGrad()->addRowScale(
0, *latentVectors_->getW(), *tmpSumTrans);
/* Increase the gradient update count */
latentVectors_->getParameterPtr()->incUpdate(callback);
}
/* Calculate the input layer's gradient */
MatrixPtr inGrad = getInputGrad(0);
if (inGrad != NULL) {
inGrad->mul(
*inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
inGrad->addColScale(0, *inputV, *tmpSum_);
inGrad->rowScale(0, *inGrad, *oGrad);
}
}
} // namespace paddle

@ -0,0 +1,80 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Layer.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/ThreadLocal.h"
namespace paddle {
/**
* @brief The Factorization Machine models pairwise (order-2) feature
* interactions as the inner product of the learned latent vectors
* corresponding to each input feature.
*
* The Factorization Machine can effectively capture feature interactions,
* especially when the input is sparse. While in principle FM can model
* higher-order feature interactions, in practice usually only order-2
* interactions are considered. The Factorization Machine Layer here only
* computes the order-2 interactions with the formula:
*
* \f[
* y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
* \f]
*
* The detailed forward and backward calculations can be found in the paper:
*
* Factorization Machines (Steffen Rendle, ICDM 2010).
*
* The config file API is factorization_machine.
*/
class FactorizationMachineLayer : public Layer {
protected:
// The latent vectors, shape: (size, factorSize_)
// Each row of the latentVectors_ matrix is the latent vector
// corresponding to one input feature dimension
std::unique_ptr<Weight> latentVectors_;
// The hyperparameter that defines the dimensionality of the factorization
size_t factorSize_;
private:
// Store the square values of the latent vectors matrix
MatrixPtr latentVectorsSquare_;
// Store the square values of input matrix
MatrixPtr inputSquare_;
// The result of input matrix * latent vector matrix that will be used in
// both the forward and backward steps
MatrixPtr inputMulFactor_;
// Store temporary calculation result
MatrixPtr tmpOut_;
MatrixPtr tmpSum_;
MatrixPtr tmpInput_;
// A row vector of -1s used to compute negated column sums
MatrixPtr negOnes_;
public:
explicit FactorizationMachineLayer(const LayerConfig& config)
: Layer(config) {}
~FactorizationMachineLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
};
} // namespace paddle

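For context, the forward pass in FactorizationMachineLayer.cpp above relies on the standard O(kn) rewriting of the pairwise term; the identity below is implied by the code rather than stated in the diff:

\f[
\sum_{i=1}^{n-1}\sum_{j=i+1}^{n}\langle v_i, v_j \rangle x_i x_j
= \frac{1}{2}\sum_{f=1}^{k}\left[\Big(\sum_{i=1}^{n} v_{i,f}\, x_i\Big)^{2}
- \sum_{i=1}^{n} v_{i,f}^{2}\, x_i^{2}\right]
\f]

The inner sum over i for each factor f is exactly inputMulFactor_ (inputV times latentVectors_); its square feeds the first sumRows call with coefficient 0.5, and inputSquare_ times latentVectorsSquare_ supplies the second term with coefficient -0.5.
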
@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
batchSize,
codeLength_,
/* trans */ false,
useGpu(deviceId_));
false);
Matrix::resizeOrCreate(preOutput_.grad,
batchSize,
codeLength_,
/* trans */ false,
useGpu(deviceId_));
false);
IVectorPtr label = getInput(*getLabelLayer()).ids;
preOutput_.value->zeroMem();
if (useGpu_) {
Matrix::resizeOrCreate(cpuOutput_,
output_.value->getHeight(),
output_.value->getWidth(),
/* trans */ false,
false);
IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
cpuLabel_->copyFrom(*label);
cpuOutput_->copyFrom(*output_.value);
} else {
cpuOutput_ = output_.value;
cpuLabel_ = label;
}
/* add the bias-vector */
if (biases_.get() != NULL) {
preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW());
if (useGpu_) {
Matrix::resizeOrCreate(cpuBias_,
1,
numClasses_ - 1,
/* trans */ false,
false);
cpuBias_->copyFrom(*biases_->getW());
} else {
cpuBias_ = biases_->getW();
}
preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
}
for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
MatrixPtr input = getInputValue(i);
if (useGpu_) {
Matrix::resizeOrCreate(cpuInput_,
input->getHeight(),
input->getWidth(),
/* trans */ false,
false);
Matrix::resizeOrCreate(cpuWeight_,
weights_[i]->getW()->getHeight(),
weights_[i]->getW()->getWidth(),
/* trans */ false,
false);
cpuInput_->copyFrom(*input);
cpuWeight_->copyFrom(*weights_[i]->getW());
} else {
cpuInput_ = input;
cpuWeight_ = weights_[i]->getW();
}
preOutput_.value->mulByBitCode(
numClasses_, *label, *weights_[i]->getW(), *input);
numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
}
// keep consistent with the clipping in the following softrelu
preOutput_.value->clip(-40.0, 40.0);
preOutput_.value->sumByBitCode(numClasses_,
*label,
*output_.value,
*cpuLabel_,
*cpuOutput_,
-1); // scaleSum
preOutput_.value->softrelu(*preOutput_.value);
MatrixPtr sum =
Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_));
MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
preOutput_.value->rowSum(*sum);
output_.value->add(*sum);
cpuOutput_->add(*sum);
if (useGpu_) {
output_.value->copyFrom(*cpuOutput_);
} else {
output_.value = cpuOutput_;
}
}
void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
IVectorPtr label = getInput(*getLabelLayer()).ids;
if (useGpu_) {
IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
cpuLabel_->copyFrom(*label);
} else {
cpuLabel_ = label;
}
preOutput_.grad->one();
preOutput_.grad->softreluDerivative(*preOutput_.value);
preOutput_.grad->subByBitCode(numClasses_, *label);
preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
if (biases_ && biases_->getWGrad()) {
preOutput_.grad->addByBitCodeBackward(
numClasses_, *label, *biases_->getWGrad());
MatrixPtr biases_grad = biases_->getWGrad();
if (useGpu_) {
Matrix::resizeOrCreate(cpuBias_,
1,
numClasses_ - 1,
/* trans */ false,
false);
cpuBias_->copyFrom(*biases_grad);
} else {
cpuBias_ = biases_grad;
}
preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
if (useGpu_) {
biases_grad->copyFrom(*cpuBias_);
} else {
biases_grad = cpuBias_;
}
/* Increase the gradient update count */
biases_->getParameterPtr()->incUpdate(callback);
}
@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
/* Calculate the W-gradient for the current layer */
MatrixPtr input = getInputValue(i);
if (weights_[i]->getWGrad()) {
MatrixPtr weights_grad = weights_[i]->getWGrad();
if (useGpu_) {
Matrix::resizeOrCreate(cpuInput_,
input->getHeight(),
input->getWidth(),
/* trans */ false,
false);
Matrix::resizeOrCreate(cpuWeightGrad_,
weights_grad->getHeight(),
weights_grad->getWidth(),
/* trans */ false,
false);
cpuInput_->copyFrom(*input);
cpuWeightGrad_->copyFrom(*weights_grad);
} else {
cpuInput_ = input;
cpuWeightGrad_ = weights_grad;
}
preOutput_.grad->mulByBitCodeBackwardWeight(
numClasses_, *label, *weights_[i]->getWGrad(), *input);
numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
if (useGpu_) {
weights_grad->copyFrom(*cpuWeightGrad_);
} else {
weights_grad = cpuWeightGrad_;
}
/* Increase the gradient update count */
weights_[i]->getParameterPtr()->incUpdate(callback);
}
@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
/* Calculate the input layers error */
MatrixPtr inputGrad = getInputGrad(i);
if (inputGrad) {
if (useGpu_) {
Matrix::resizeOrCreate(cpuInputGrad_,
inputGrad->getHeight(),
inputGrad->getWidth(),
/* trans */ false,
false);
Matrix::resizeOrCreate(cpuWeight_,
weights_[i]->getW()->getHeight(),
weights_[i]->getW()->getWidth(),
/* trans */ false,
false);
cpuInputGrad_->copyFrom(*inputGrad);
cpuWeight_->copyFrom(*weights_[i]->getW());
} else {
cpuInputGrad_ = inputGrad;
cpuWeight_ = weights_[i]->getW();
}
preOutput_.grad->mulByBitCodeBackwardError(
numClasses_, *label, *weights_[i]->getW(), *inputGrad);
numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
if (useGpu_) {
inputGrad->copyFrom(*cpuInputGrad_);
} else {
inputGrad = cpuInputGrad_;
}
}
}
}

@ -80,6 +80,15 @@ protected:
int codeLength_;
/// temporary result of output_
Argument preOutput_;
/// Temporary CPU copies used when the layer runs on GPU.
MatrixPtr cpuWeight_;
MatrixPtr cpuWeightGrad_;
MatrixPtr cpuInput_;
MatrixPtr cpuInputGrad_;
MatrixPtr cpuBias_;
MatrixPtr cpuOutput_;
IVectorPtr cpuLabel_;
};
} // namespace paddle

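The GPU branch added above follows a stage-to-CPU idiom throughout: copy device buffers to host, run the bit-code kernels on CPU, then copy results back. A minimal standalone sketch of that idiom (a hypothetical helper, not part of Paddle):

#include <functional>

// Illustration of the stage-to-CPU pattern used by HierarchicalSigmoidLayer:
// Buffer must expose copyFrom(), as the Paddle Matrix/IVector types in the
// diff do.
template <typename Buffer>
void StageToCpuAndCompute(Buffer& deviceBuf, Buffer& cpuBuf, bool useGpu,
                          const std::function<void(Buffer&)>& compute) {
  if (useGpu) {
    cpuBuf.copyFrom(deviceBuf);   // device -> host copy
    compute(cpuBuf);              // CPU-only kernel runs on the host copy
    deviceBuf.copyFrom(cpuBuf);   // write results back to the device
  } else {
    compute(deviceBuf);           // CPU build: operate in place
  }
}
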
@ -681,12 +681,13 @@ TEST(Layer, hsigmoidLayer) {
config.layerConfig.add_inputs();
config.layerConfig.add_inputs();
// GPU is now supported via the layer's internal CPU fallback
testLayerGrad(config,
"hsigmoid",
100,
/* trans */ false, /* useGpu */
false);
for (auto useGpu : {false, true}) {
testLayerGrad(config,
"hsigmoid",
100,
/* trans */ false,
/* useGpu */ useGpu);
}
}
TEST(Layer, multi_cross) {
@ -2464,6 +2465,25 @@ TEST(Layer, L2DistanceLayer) {
}
}
void testFactorizationMachineLayer(InputType type, bool useGpu) {
const int FACTOR_SIZE = 10;
TestConfig config;
config.layerConfig.set_type("factorization_machine");
config.layerConfig.set_factor_size(FACTOR_SIZE);
config.layerConfig.set_size(1);
config.biasSize = 0;
config.inputDefs.push_back({type, "layer_0", 128, 1280});
config.layerConfig.add_inputs();
testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
}
TEST(Layer, FactorizationMachineLayer) {
for (auto useGpu : {false, true}) {
testFactorizationMachineLayer(INPUT_DATA, useGpu);
}
testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);

@ -260,6 +260,35 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
os << ";";
}
void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
CHECK(getFormat() != SPARSE_CSC) << "Not supported";
CHECK_EQ(height_, b.getHeight());
CHECK_EQ(width_, b.getWidth());
real* A = getValue();
real* B = b.getValue();
if (b.getValueType() == FLOAT_VALUE) {
for (size_t i = 0; i < height_; i++) {
size_t start = getRowStartIdx(i);
size_t end = getRowStartIdx(i + 1);
CHECK_EQ(start, b.getRowStartIdx(i));
CHECK_EQ(end, b.getRowStartIdx(i + 1));
for (size_t j = start; j < end; j++) {
A[j] = B[j] * c.getElement(i, cCol);
}
}
} else if (b.getValueType() == NO_VALUE) {
for (size_t i = 0; i < height_; i++) {
size_t start = getRowStartIdx(i);
size_t end = getRowStartIdx(i + 1);
CHECK_EQ(start, b.getRowStartIdx(i));
CHECK_EQ(end, b.getRowStartIdx(i + 1));
for (size_t j = start; j < end; j++) {
A[j] = c.getElement(i, cCol);
}
}
}
}
void CpuSparseMatrix::randomizeUniform() {
CHECK_LE(elementCnt_, height_ * width_);
if (valueType_ == FLOAT_VALUE) {

@ -239,6 +239,15 @@ public:
const unsigned int* cols,
const real* values);
/**
* @brief this_row = b_row * c(row, cCol)
*
* @param[in] cCol the column of matrix c used to scale each row of b
* @param[in] b CpuSparseMatrix
* @param[in] c Matrix
*/
void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c);
void randomizeUniform();
void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream);

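As a dense-free illustration of what rowScale does over CSR storage (a hypothetical standalone helper, not the Paddle API): every stored value in row i is multiplied by a single per-row scale, which in the layer above is column cCol of the output gradient.

#include <cstddef>
#include <vector>

// Scale row i of a CSR value array by scale[i]; rowStart has size rows + 1,
// mirroring CpuSparseMatrix::getRowStartIdx() in the diff above.
void RowScaleCsr(std::vector<float>& values,
                 const std::vector<std::size_t>& rowStart,
                 const std::vector<float>& scale) {
  for (std::size_t i = 0; i + 1 < rowStart.size(); ++i) {
    for (std::size_t j = rowStart[i]; j < rowStart[i + 1]; ++j) {
      values[j] *= scale[i];
    }
  }
}
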
@ -205,8 +205,24 @@ set(DEPS_OPS
tensor_array_read_write_op
gru_op
adagrad_op
sgd_op)
sgd_op
save_op
load_op
send_op
recv_op)
add_subdirectory(detail)
op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set_source_files_properties(
send_op.cc
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set_source_files_properties(
recv_op.cc
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cross_entropy_op DEPS cross_entropy)
@ -235,6 +251,10 @@ op_library(conv_transpose_op DEPS vol2col)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
# FIXME(typhoonzero): save/load depend on lodtensor serialization functions
op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
op_library(${src})
@ -242,6 +262,8 @@ endforeach()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
@ -251,3 +273,4 @@ if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)

@ -62,13 +62,14 @@ class BatchNormOp : public framework::OperatorWithKernel {
const auto x_dims = ctx->GetInputDim("X");
const TensorFormat tensor_format =
StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"Input X must have 2 to 5 dimensions.");
const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
"Input X must have 3 to 5 dimensions.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
@ -146,8 +147,8 @@ class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
"The Input dim size should be between 3 and 5");
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 2 and 5");
const int N = x_dims[0];
const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1]
@ -339,8 +340,8 @@ class BatchNormGradKernel<platform::CPUPlace, T>
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
"The Input dim size should be between 3 and 5");
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 2 and 5");
const int N = x_dims[0];
const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1]

Some files were not shown because too many files have changed in this diff
