Merge branch 'develop' into cmake

release/0.11.0
Luo Tao 7 years ago
commit 57157284b2

@ -54,6 +54,7 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
# CMAKE_BUILD_TYPE
@ -67,9 +68,6 @@ if(ANDROID OR IOS)
if(ANDROID)
if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
message(FATAL_ERROR "Unsupported standalone toolchains with Android API level lower than 16")
elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
# TODO: support glog for Android api 16 ~ 19 in the future
message(WARNING "Using the unofficial git repository <https://github.com/Xreki/glog.git> instead")
endif()
endif()
@ -83,6 +81,8 @@ if(ANDROID OR IOS)
"Disable RDMA when cross-compiling for Android and iOS" FORCE)
set(WITH_MKL OFF CACHE STRING
"Disable MKL when cross-compiling for Android and iOS" FORCE)
set(WITH_GOLANG OFF CACHE STRING
"Disable golang when cross-compiling for Android and iOS" FORCE)
# Compile PaddlePaddle mobile inference library
if (NOT WITH_C_API)

@ -6,10 +6,21 @@ width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
use_gpu = get_config_arg('use_gpu', bool, True)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
is_infer = get_config_arg("is_infer", bool, False)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer
}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
"train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings(
batch_size=batch_size,
@ -146,7 +157,6 @@ def inception(name, input, channels, \
return cat
lab = data_layer(name="label", size=1000)
data = data_layer(name="input", size=3 * height * width)
# stage 1
@ -224,6 +234,10 @@ pool5 = img_pool_layer(
dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
out3 = fc_layer(
name="output3", input=dropout, size=1000, act=SoftmaxActivation())
loss3 = cross_entropy(name='loss3', input=out3, label=lab)
outputs(loss3)
if is_infer:
outputs(out3)
else:
lab = data_layer(name="label", size=num_class)
loss3 = cross_entropy(name='loss3', input=out3, label=lab)
outputs(loss3)
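The `is_infer` flag above is not hard-coded; it arrives through `--config_args`, as the benchmark script later in this diff shows. A minimal invocation sketch (paths are placeholders following the script's `models/${topology}-${layer_num}` convention):

    paddle train --job=test \
        --config="googlenet.py" \
        --use_gpu=False \
        --config_args="batch_size=1,is_infer=True" \
        --init_model_path="models/googlenet-v1/pass-00000/"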

@ -13,14 +13,20 @@ def initHook(settings, height, width, color, num_class, **kwargs):
settings.data_size = settings.height * settings.width * 3
else:
settings.data_size = settings.height * settings.width
settings.slots = [dense_vector(settings.data_size), integer_value(1)]
settings.is_infer = kwargs.get('is_infer', False)
if settings.is_infer:
settings.slots = [dense_vector(settings.data_size)]
else:
settings.slots = [dense_vector(settings.data_size), integer_value(1)]
@provider(
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_list):
for i in xrange(1024):
for i in xrange(2560 if settings.is_infer else 1024):
img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
lab = random.randint(0, settings.num_class - 1)
yield img.astype('float32'), int(lab)
if settings.is_infer:
yield img.astype('float32')
else:
lab = random.randint(0, settings.num_class - 1)
yield img.astype('float32'), int(lab)

@ -6,11 +6,21 @@ width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg("layer_num", int, 50)
is_test = get_config_arg("is_test", bool, False)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
is_infer = get_config_arg("is_infer", bool, False)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer
}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
"train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings(
batch_size=batch_size,
@ -45,7 +55,10 @@ def conv_bn_layer(name,
act=LinearActivation(),
bias_attr=False)
return batch_norm_layer(
name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)
name=name + "_bn",
input=tmp,
act=active_type,
use_global_stats=is_infer)
def bottleneck_block(name, input, num_filters1, num_filters2):
@ -207,7 +220,9 @@ elif layer_num == 152:
else:
print("Wrong layer number.")
lbl = data_layer(name="label", size=num_class)
loss = cross_entropy(name='loss', input=resnet, label=lbl)
inputs(img, lbl)
outputs(loss)
if is_infer:
outputs(resnet)
else:
lbl = data_layer(name="label", size=num_class)
loss = cross_entropy(name='loss', input=resnet, label=lbl)
outputs(loss)

@ -0,0 +1,86 @@
set -e
function clock_to_seconds() {
hours=`echo $1 | awk -F ':' '{print $1}'`
mins=`echo $1 | awk -F ':' '{print $2}'`
secs=`echo $1 | awk -F ':' '{print $3}'`
echo `bc -l <<< "$secs + $mins * 60 + $hours * 3600"`
}
function infer() {
unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
topology=$1
layer_num=$2
bs=$3
use_mkldnn=$4
if [ $4 == "True" ]; then
thread=1
log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log"
elif [ $4 == "False" ]; then
thread=`nproc`
if [ $thread -gt $bs ]; then
thread=$bs
fi
log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log"
else
echo "Wrong input $4, use True or False."
exit 0
fi
models_in="models/${topology}-${layer_num}/pass-00000/"
if [ ! -d $models_in ]; then
echo "Training model ${topology}_${layer_num}"
paddle train --job=train \
--config="${topology}.py" \
--use_mkldnn=True \
--use_gpu=False \
--trainer_count=1 \
--num_passes=1 \
--save_dir="models/${topology}-${layer_num}" \
--config_args="batch_size=128,layer_num=${layer_num}" \
> /dev/null 2>&1
echo "Done"
fi
log_period=$((256 / bs))
paddle train --job=test \
--config="${topology}.py" \
--use_mkldnn=$use_mkldnn \
--use_gpu=False \
--trainer_count=$thread \
--log_period=$log_period \
--config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
--init_model_path=$models_in \
2>&1 | tee ${log}
# calculate the time of the last 5 logging periods (1280 samples in total);
# the periods before that are treated as warm-up time.
start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
start_sec=`clock_to_seconds $start`
end_sec=`clock_to_seconds $end`
fps=`bc <<< "scale = 2; 1280 / ($end_sec - $start_sec)"`
echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
echo "FPS: $fps images/sec" >> ${log}
}
if [ ! -f "train.list" ]; then
echo " " > train.list
fi
if [ ! -f "test.list" ]; then
echo " " > test.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
if [ ! -d "models" ]; then
mkdir -p models
fi
# inference benchmark
for use_mkldnn in True False; do
for batchsize in 1 2 4 8 16; do
infer googlenet v1 $batchsize $use_mkldnn
infer resnet 50 $batchsize $use_mkldnn
infer vgg 19 $batchsize $use_mkldnn
done
done
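Why 1280: `log_period` is 256/bs batches, so each logged period covers 256 samples and the last 5 periods cover 1280. A standalone check of the timing arithmetic, reusing the script's own `clock_to_seconds` (the timestamps are made up for illustration; the `awk '{print $2}'` extraction above assumes the log's second field is an HH:MM:SS timestamp):

    start_sec=`clock_to_seconds "10:01:30"`
    end_sec=`clock_to_seconds "10:02:10"`
    bc <<< "scale = 2; 1280 / ($end_sec - $start_sec)"  # prints 32.00 images/sec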

@ -8,13 +8,13 @@ function train() {
use_mkldnn=$4
if [ $4 == "True" ]; then
thread=1
log="logs/${topology}-${layer_num}-mkldnn-${bs}.log"
log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log"
elif [ $4 == "False" ]; then
thread=`nproc`
# each of the trainer_count trainers uses only 1 core to avoid conflicts
log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log"
else
echo "Wrong input $3, use True or False."
echo "Wrong input $4, use True or False."
exit 0
fi
args="batch_size=${bs},layer_num=${layer_num}"
@ -30,13 +30,14 @@ function train() {
2>&1 | tee ${log}
}
if [ ! -d "train.list" ]; then
if [ ! -f "train.list" ]; then
echo " " > train.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
# training benchmark
for use_mkldnn in True False; do
for batchsize in 64 128 256; do
train vgg 19 $batchsize $use_mkldnn
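Note the `-d` to `-f` fix above: `-d` tests for a directory and never matches the `train.list` file, so the old guard recreated the file on every run. A quick illustration:

    echo " " > train.list
    [ -d "train.list" ] && echo "directory" || echo "not a directory"  # not a directory
    [ -f "train.list" ] && echo "regular file"                         # regular file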

@ -6,10 +6,21 @@ width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg('layer_num', int, 19)
is_infer = get_config_arg("is_infer", bool, False)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer
}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
"train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings(
batch_size=batch_size,
@ -98,6 +109,9 @@ elif layer_num == 19:
else:
print("Wrong layer number.")
lab = data_layer('label', num_class)
loss = cross_entropy(input=vgg, label=lab)
outputs(loss)
if is_infer:
outputs(vgg)
else:
lab = data_layer('label', num_class)
loss = cross_entropy(input=vgg, label=lab)
outputs(loss)

@ -13,7 +13,7 @@
# limitations under the License.
#
IF(MOBILE_INFERENCE)
IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
return()
ENDIF()

@ -26,12 +26,21 @@ ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
# Using the unofficial glog for Android API < 21
SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git")
SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8")
ELSE()
SET(GLOG_REPOSITORY "https://github.com/google/glog.git")
SET(GLOG_TAG "v0.3.5")
ENDIF()
ExternalProject_Add(
extern_glog
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS gflags
GIT_REPOSITORY "https://github.com/google/glog.git"
GIT_TAG v0.3.5
GIT_REPOSITORY ${GLOG_REPOSITORY}
GIT_TAG ${GLOG_TAG}
PREFIX ${GLOG_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

@ -13,7 +13,7 @@
# limitations under the License.
#
IF(MOBILE_INFERENCE)
IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
return()
ENDIF()

@ -188,14 +188,26 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
ENDIF()
SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
IF(MOBILE_INFERENCE)
# The reason why the official version is not used is described in
# https://github.com/PaddlePaddle/Paddle/issues/6114
SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git")
SET(PROTOBUF_TAG "v3.2.0")
IF(NOT BUILD_FOR_HOST)
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF")
ENDIF()
ENDIF()
ExternalProject_Add(
${TARGET_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${PROTOBUF_SOURCES_DIR}
UPDATE_COMMAND ""
DEPENDS zlib
GIT_REPOSITORY "https://github.com/google/protobuf.git"
GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
GIT_REPOSITORY ${PROTOBUF_REPO}
GIT_TAG ${PROTOBUF_TAG}
CONFIGURE_COMMAND
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
${OPTIONAL_ARGS}
@ -213,7 +225,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
)
ENDFUNCTION()
SET(PROTOBUF_VERSION 3.1)
IF(NOT MOBILE_INFERENCE)
SET(PROTOBUF_VERSION 3.1)
ELSE()
SET(PROTOBUF_VERSION 3.2)
ENDIF()
IF(CMAKE_CROSSCOMPILING)
build_protobuf(protobuf_host TRUE)
LIST(APPEND external_project_dependencies protobuf_host)

@ -111,6 +111,8 @@ set(COMMON_FLAGS
-Wno-error=sign-compare
-Wno-error=unused-local-typedefs
-Wno-error=parentheses-equality # Warnings in pybind11
-Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3
-Wno-error=terminate # Warning in PADDLE_ENFORCE
)
set(GPU_COMMON_FLAGS

@ -227,8 +227,8 @@ function(cc_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endfunction(cc_test)
@ -288,8 +288,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_test(${TARGET_NAME} ${TARGET_NAME})
endif()
endfunction(nv_test)
@ -505,12 +505,12 @@ function(grpc_library TARGET_NAME)
set_source_files_properties(
${grpc_grpc_srcs}
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
set_source_files_properties(
${grpc_library_SRCS}
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
endfunction()

File diff suppressed because it is too large

Binary file not shown (added; 13 KiB)

Binary file not shown (added; 22 KiB)

Binary file not shown (added; 11 KiB)

Binary file not shown (added; 18 KiB)

Binary file not shown (changed; 9.7 KiB → 10 KiB)

@ -1,13 +1,13 @@
This tutorial introduces techniques we used to profile and tune the
This tutorial introduces techniques we use to profile and tune the
CPU performance of PaddlePaddle. We will use Python packages
`cProfile` and `yep`, and Google `perftools`.
`cProfile` and `yep`, and Google's `perftools`.
Profiling is the process that reveals the performance bottlenecks,
Profiling is the process that reveals performance bottlenecks,
which could be very different from what's in the developers' mind.
Performance tuning is to fix the bottlenecks. Performance optimization
Performance tuning is done to fix these bottlenecks. Performance optimization
repeats the steps of profiling and tuning alternatively.
PaddlePaddle users program AI by calling the Python API, which calls
PaddlePaddle users program AI applications by calling the Python API, which calls
into `libpaddle.so`, written in C++. In this tutorial, we focus on
the profiling and tuning of
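A sketch of the Python-level workflow these tools imply (file names are placeholders, not from this diff; `cprofilev` is the web viewer the tutorial mentions below):

    python -m cProfile -o profile.out main.py  # collect Python-level statistics
    cprofilev -f profile.out                   # browse them, sortable, in a browser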
@ -82,7 +82,7 @@ focus on. We can sort the above profiling file by tottime:
We can see that the most time-consuming function is the `built-in
method run`, which is a C++ function in `libpaddle.so`. We will
explain how to profile C++ code in the next section. At the right
explain how to profile C++ code in the next section. At this
moment, let's look into the third function `sync_with_cpp`, which is a
Python function. We can click it to understand more about it:
@ -135,8 +135,8 @@ to generate the profiling file. The default filename is
`main.py.prof`.
Please be aware of the `-v` command line option, which prints the
analysis results after generating the profiling file. By taking a
glance at the print result, we'd know that if we stripped debug
analysis results after generating the profiling file. By examining
the print result, we'd know whether we stripped debug
information from `libpaddle.so` at build time. The following hints
help make sure that the analysis results are readable:
@ -155,9 +155,9 @@ help make sure that the analysis results are readable:
variable `OMP_NUM_THREADS=1` to prevent OpenMP from automatically
starting multiple threads.
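Putting the two hints together, a sketch of a single profiling run (`main.py` is a placeholder; `yep` writes `main.py.prof` by default, as noted above):

    OMP_NUM_THREADS=1 python -m yep -v main.py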
### Look into the Profiling File
### Examining the Profiling File
The tool we used to look into the profiling file generated by
The tool we used to examine the profiling file generated by
`perftools` is [`pprof`](https://github.com/google/pprof), which
provides a Web-based GUI like `cprofilev`.
@ -194,4 +194,4 @@ time, and `MomentumOp` takes about 17%. Obviously, we'd want to
optimize `MomentumOp`.
`pprof` would mark performance critical parts of the program in
red. It's a good idea to follow the hint.
red. It's a good idea to follow the hints.
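For reference, one way to open the `perftools` output in `pprof`'s web view (the exact invocation is an assumption; point it at the Python binary that produced `main.py.prof`):

    pprof --web `which python` main.py.prof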

@ -4,6 +4,16 @@ else ()
set(PADDLE_FLOAT_TYPE float)
endif()
execute_process(
COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_GIT_COMMIT
RESULT_VARIABLE PADDLE_GIT_COMMIT_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT PADDLE_GIT_COMMIT)
set(PADDLE_GIT_COMMIT "no commit information")
endif()
# config.h is used for the C-API. It stores the Paddle build configuration as a
# header, so that users can simply include PaddleCAPI.h and get the build
# configuration without explicitly setting -DPADDLE_WITH_DOUBLE when building their

@ -3,6 +3,9 @@
typedef @PADDLE_FLOAT_TYPE@ paddle_real;
#define __PADDLE_VERSION__ "@PADDLE_VERSION@"
#define __PADDLE_COMMIT__ "@PADDLE_GIT_COMMIT@"
// Since we only support Linux and macOS at compile time, we always use clang or
// gcc 4.8+, so DLL_IMPORT/DLL_EXPORT is as simple as below.
#define PD_API __attribute__((visibility("default")))

@ -22,6 +22,12 @@ std::vector<framework::DDim> InferShapeContext::GetInputsDim(
return GetDims(names);
}
DDim InferShapeContext::GetInputsElementDim(const std::string &name,
int idx) const {
const std::vector<std::string> &names = Inputs(name);
return this->GetDim(names[idx]);
}
void InferShapeContext::SetOutputsDim(
const std::string &name, const std::vector<framework::DDim> &dims) {
auto &names = Outputs(name);

@ -37,6 +37,7 @@ class InferShapeContext {
virtual framework::DDim GetInputDim(const std::string &name) const = 0;
std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
DDim GetInputsElementDim(const std::string &name, int idx) const;
virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
void SetOutputsDim(const std::string &name,

@ -21,7 +21,7 @@ template <class T>
struct EigenBlasGemm {
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
Eigen::Aligned>
Matrix;
EigenMatrix;
static void compute(const bool transA,
const bool transB,
@ -56,14 +56,13 @@ struct EigenBlasGemm {
sizeB[1] = N;
CHECK_EQ(N, ldb);
}
Eigen::array<int, 2> sizeC;
sizeC[0] = M;
sizeC[1] = N;
CHECK_EQ(N, ldc);
Eigen::array<int, 2> sizeC = {{M, ldc}};
Eigen::array<int, 2> offsetC = {{0, 0}};
Eigen::array<int, 2> extentC = {{M, N}};
const Matrix a(const_cast<T*>(A), sizeA);
const Matrix b(const_cast<T*>(B), sizeB);
Matrix c(C, sizeC);
const EigenMatrix a(const_cast<T*>(A), sizeA);
const EigenMatrix b(const_cast<T*>(B), sizeB);
EigenMatrix c(C, sizeC);
typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
@ -72,12 +71,23 @@ struct EigenBlasGemm {
dims[0].second = transB ? 1 : 0;
Eigen::DefaultDevice device;
if (alpha == T(1) && beta == T(0)) {
c.device(device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.device(device) += a.contract(b, dims);
if (N == ldc) {
if (alpha == T(1) && beta == T(0)) {
c.device(device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.device(device) += a.contract(b, dims);
} else {
c.device(device) = alpha * a.contract(b, dims) + beta * c;
}
} else {
c.device(device) = alpha * a.contract(b, dims) + beta * c;
if (alpha == T(1) && beta == T(0)) {
c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.slice(offsetC, extentC).device(device) += a.contract(b, dims);
} else {
c.slice(offsetC, extentC).device(device) =
alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
}
}
}
};
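In math terms (a summary of the code above, not text from this diff), both branches compute the same update; the slice branch merely restricts writes to the M-by-N window of C when the row stride ldc exceeds N:

    C_{ij} \leftarrow \alpha \sum_k \hat{A}_{ik} \hat{B}_{kj} + \beta\, C_{ij},
    \quad 0 \le i < M,\ 0 \le j < N,
    \text{where } C_{ij} \text{ is stored at offset } i \cdot ldc + j
    (\hat{A}, \hat{B} \text{ denote } A, B \text{ transposed or not per transA/transB})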

Some files were not shown because too many files have changed in this diff.
