Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/combine_open_files_and_double_buffer

7 years ago · be528f9815
parent 72b78154b2 968922bd6c
commit be528f9815
61 changed files with 828 additions and 516 deletions
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@ -210,7 +210,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
    # generate fake:
    if args.use_fake_data:
        for var in feed_var_list:
-            v = startup_prog.global_block().clone_variable(var)
+            v = startup_prog.global_block()._clone_variable(var)
            var.persistable = True
            v.persistable = True

--- a/doc/fluid/design/modules/python_api.md
+++ b/doc/fluid/design/modules/python_api.md
@ -98,13 +98,13 @@ class Block(objects):
    def append_operator(self, ...):
        self.ops.append(Operator(self, ...))

-    def prepend_operator(self, ...): # Parameter's ctor prepands initialize operators.
+    def _prepend_operator(self, ...): # Parameter's ctor prepands initialize operators.
       self.ops.prepend(Operator(self, ...))
 ```

 `create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.

-`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
+`_prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.

 ### Operator

--- a/doc/fluid/howto/performance/error_clip.md
+++ b/doc/fluid/howto/performance/error_clip.md
@ -78,7 +78,7 @@ def error_clip_callback(block, context):
    op_desc = block.desc.op(block.desc.op_size() - 1)
    for grad_n in filter(lambda n: grad_to_var.has_key(n),
                         op_desc.output_arg_names()):
-        fwd_var = block.var_recursive(grad_to_var[grad_n])
+        fwd_var = block.__var_recursive(grad_to_var[grad_n])
        error_clip = getattr(fwd_var, "error_clip", None)
        if not (error_clip is None or isinstance(error_clip,
                                                 BaseErrorClipAttr)):
--- a/doc/v2/build_and_install/build_from_source_cn.rst
+++ b/doc/v2/build_and_install/build_from_source_cn.rst
@ -35,11 +35,16 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
   # 2. 可选步骤：源码中构建用于编译PaddlePaddle的Docker镜像
   docker build -t paddle:dev .
   # 3. 执行下面的命令编译CPU-Only的二进制
-   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
+   docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
   # 4. 或者也可以使用为上述可选步骤构建的镜像（必须先执行第2步）
   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build

-注：上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。
+注：
+
+- 上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。
+
+- 如果您使用的是 manylinux 的镜像进行编译, 那么您需要通过环境变量 :code:`PYTHON_ABI` 来指定一个 `Python ABI <https://www.python.org/dev/peps/pep-0425/#id8>`__.
+PaddlePaddle目前支持的 Python ABI 有 :code:`cp27-cp27m` 和 :code:`cp27-cp27mu`.

 编译完成后会在build/python/dist目录下生成输出的whl包，可以选在在当前机器安装也可以拷贝到目标机器安装：

--- a/doc/v2/build_and_install/build_from_source_en.rst
+++ b/doc/v2/build_and_install/build_from_source_en.rst
@ -36,13 +36,18 @@ If you don't wish to use docker，you need to install several compile dependenci
   # 2. Optional: build development docker image from source
   docker build -t paddle:dev .
   # 3. Run the following command to build a CPU-Only binaries
-   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
+   docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
   # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build

-NOTE: The above command try to mount the current working directory (root directory of source code)
+NOTE: 
+
+- The above command try to mount the current working directory (root directory of source code)
 into :code:`/paddle` directory inside docker container.

+- You need to pass in the required environment variable :code:`PYTHON_ABI` to specify a `Python ABI <https://www.python.org/dev/peps/pep-0425/#id8>`__.
+Currently PaddlePaddle supported Python ABIs include :code:`cp27-cp27m` and :code:`cp27-cp27mu` .
+
 When the compile finishes, you can get the output whl package under
 build/python/dist, then you can choose to install the whl on local
 machine or copy it to the target machine.
--- a/paddle/contrib/float16/float16_transpiler.py
+++ b/paddle/contrib/float16/float16_transpiler.py
@ -118,7 +118,7 @@ class Float16Transpiler:

        for var in self.block.vars.keys():
            if var not in args:
-                self.block.remove_var(var)
+                self.block._remove_var(var)

    def _modify_feed_fetch(self):
        '''
@ -165,7 +165,7 @@ class Float16Transpiler:
                    dtype=core.VarDesc.VarType.FP16,
                    shape=var.shape,
                    persistable=var.persistable)
-                self.block.insert_op(
+                self.block._insert_op(
                    i + 1,
                    type="cast",
                    inputs={"X": var},
@ -188,7 +188,7 @@ class Float16Transpiler:
                    persistable=var.persistable)
                find_op(var)
                var.op.rename_output(var_name, tmp_var_name)
-                self.block.insert_op(
+                self.block._insert_op(
                    i,
                    type="cast",
                    inputs={"X": tmp_var},
@ -253,4 +253,4 @@ class Float16Transpiler:

            # old var will be replaced by the fp16 var in program desc
            self.input_map[var.name] = fp16_var_name
-            self.block.remove_var(var.name)
+            self.block._remove_var(var.name)
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@ -104,7 +104,3 @@ if (WITH_ANAKIN) # only needed in CI
        target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
     endif(WITH_TESTING)
 endif()
-
-if(WITH_TESTING)
-    add_subdirectory(demo)
-endif()
--- a/paddle/contrib/inference/demo/CMakeLists.txt
+++ b/paddle/contrib/inference/demo/CMakeLists.txt
@ -1,59 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF)
-if(NOT WITH_INFERENCE_DEMO)
-  return()
-endif()
-
-set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo")
-set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F)
-
-function(inference_download_test_demo TARGET)
-    if (NOT WITH_TESTING)
-        return()
-    endif()
-    set(options "")
-    set(oneValueArgs URL)
-    set(multiValueArgs SRCS)
-    cmake_parse_arguments(tests "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-    set(test_dir "${DEMO_INSTALL_DIR}/${TARGET}")
-    message(STATUS "inference demo ${test_dir}")
-
-    if(NOT EXISTS "${test_dir}")
-        message(STATUS "Download ${TARGET} model from ${tests_URL}")
-        execute_process(COMMAND bash -c "mkdir -p ${test_dir}")
-        execute_process(COMMAND bash -c "cd ${test_dir}; wget -q ${tests_URL}")
-        execute_process(COMMAND bash -c "cd ${test_dir}; tar xzf *.tar.gz")
-    endif()
-
-    cc_test(${TARGET} SRCS "${tests_SRCS}"
-        DEPS paddle_inference_api paddle_fluid
-        ARGS --data=${test_dir}/data.txt
-             --modeldir=${test_dir}/model
-             --refer=${test_dir}/result.txt)
-endfunction()
-
-# disable mobilenet test
-#inference_download_test_demo(mobilenet_inference_demo
-#    SRCS vis_demo.cc
-#    URL ${URL_ROOT}mobilenet.tar.gz)
-inference_download_test_demo(se_resnext50_inference_demo
-    SRCS vis_demo.cc
-    URL ${URL_ROOT}se_resnext50.tar.gz)
-inference_download_test_demo(ocr_inference_demo
-    SRCS vis_demo.cc
-    URL ${URL_ROOT}ocr.tar.gz)
--- a/paddle/contrib/inference/demo/README.md
+++ b/paddle/contrib/inference/demo/README.md
@ -1,36 +0,0 @@
-# Infernce Demos
-
-Input data format:
-
- Each line contains a single record
- Each record's format is
-
-```
-<space splitted floats as data>\t<space splitted ints as shape>
-```
-
-Follow the C++ codes in `vis_demo.cc`.
-
-## MobileNet
-
-To execute the demo, simply run
-
-```sh
-./mobilenet_inference_demo --modeldir <model> --data <datafile>
-```
-
-## SE-ResNeXt-50
-
-To execute the demo, simply run
-
-```sh
-./se_resnext50_inference_demo --modeldir <model> --data <datafile>
-```
-
-## OCR
-
-To execute the demo, simply run
-
-```sh
-./ocr_inference_demo --modeldir <model> --data <datafile>
-```
--- a/paddle/contrib/inference/demo_ci/.gitignore
+++ b/paddle/contrib/inference/demo_ci/.gitignore
@ -0,0 +1 @@
+data
--- a/paddle/contrib/inference/demo_ci/CMakeLists.txt
+++ b/paddle/contrib/inference/demo_ci/CMakeLists.txt
@ -52,14 +52,12 @@ else()
  set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
 endif()

+# Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
 if(WITH_STATIC_LIB)
  set(DEPS
-      "-Wl,--whole-archive"
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a
-      "-Wl,--no-whole-archive"
-      ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.a)
+      ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.a
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a)
 else()
-  # Note: libpaddle_inference_api.so must put before libpaddle_fluid.so
  set(DEPS
      ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.so
      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so)
--- a/paddle/contrib/inference/demo_ci/README.md
+++ b/paddle/contrib/inference/demo_ci/README.md
@ -0,0 +1,26 @@
+# Inference Demos
+
+There are several demos:
+
+- simple_on_word2vec: 
+  - Follow the C++ codes is in `simple_on_word2vec.cc`. 
+  - It is suitable for word2vec model.
+- vis_demo: 
+  - Follow the C++ codes is in `vis_demo.cc`. 
+  - It is suitable for mobilenet, se_resnext50 and ocr three models.
+  - Input data format:
+    - Each line contains a single record
+    - Each record's format is
+    ```
+    <space splitted floats as data>\t<space splitted ints as shape>
+    ```
+
+To build and execute the demos, simply run 
+```
+./run.sh $PADDLE_ROOT $TURN_ON_MKL $TEST_GPU_CPU
+```
+- It will build and execute the demos in both static and shared library.
+- `$PADDLE_ROOT`: paddle library path
+- `$TURN_ON_MKL`: use MKL or Openblas
+- `$TEST_GPU_CPU`: test both GPU/CPU mode or only CPU mode
+- NOTE: for simple_on_word2vec, must run `ctest -R test_word2vec -R` to obtain word2vec model at first.
--- a/paddle/contrib/inference/demo_ci/run.sh
+++ b/paddle/contrib/inference/demo_ci/run.sh
@ -1,34 +1,81 @@
 set -x
 PADDLE_ROOT=$1
-WITH_MKL=$2
-WITH_GPU=$3
-if [ $3 == "ON" ]; then
+TURN_ON_MKL=$2 # use MKL or Openblas
+TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
+if [ $2 == ON ]; then
+  # You can export yourself if move the install path
+  MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib
+  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB}
+fi
+if [ $3 == ON ]; then
  use_gpu_list='true false'
 else    
  use_gpu_list='false'
 fi

+# download vis_demo data
+function download() {
+  dir_name=$1
+  mkdir -p $dir_name
+  cd $dir_name
+  wget -q ${URL_ROOT}$dir_name.tar.gz
+  tar xzf *.tar.gz
+  cd ..
+}
+URL_ROOT=http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F
+mkdir -p data
+cd data
+vis_demo_list='se_resnext50 ocr mobilenet'
+for vis_demo_name in $vis_demo_list; do
+  download $vis_demo_name
+done
+cd ..
+
+# compile and test the demo
 mkdir -p build
 cd build

-for WITH_STATIC_LIB in false; do
+for WITH_STATIC_LIB in ON OFF; do
+  # -----simple_on_word2vec-----
  rm -rf *
  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
-    -DWITH_MKL=$WITH_MKL \
+    -DWITH_MKL=$TURN_ON_MKL \
    -DDEMO_NAME=simple_on_word2vec \
-    -DWITH_GPU=$WITH_GPU \
+    -DWITH_GPU=$TEST_GPU_CPU \
    -DWITH_STATIC_LIB=$WITH_STATIC_LIB
-  make
-  for use_gpu in $use_gpu_list; do
-    ./simple_on_word2vec \
-      --dirname=${PADDLE_ROOT}/build/python/paddle/fluid/tests/book/word2vec.inference.model \
-      --use_gpu=$use_gpu
+  make -j
+  word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model'
+  if [ -d $word2vec_model ]; then
+    for use_gpu in $use_gpu_list; do
+      ./simple_on_word2vec \
+        --dirname=$word2vec_model \
+        --use_gpu=$use_gpu
+      if [ $? -ne 0 ]; then
+        echo "simple_on_word2vec demo runs fail."
+        exit 1
+      fi
+    done
+  fi
+  # ---------vis_demo---------
+  rm -rf *
+  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+    -DWITH_MKL=$TURN_ON_MKL \
+    -DDEMO_NAME=vis_demo \
+    -DWITH_GPU=$TEST_GPU_CPU \
+    -DWITH_STATIC_LIB=$WITH_STATIC_LIB
+  make -j
+  for use_gpu in false; do
+    for vis_demo_name in $vis_demo_list; do 
+      ./vis_demo \
+        --modeldir=../data/$vis_demo_name/model \
+        --data=../data/$vis_demo_name/data.txt \
+        --refer=../data/$vis_demo_name/result.txt \
+        --use_gpu=$use_gpu
+      if [ $? -ne 0 ]; then
+        echo "vis demo $vis_demo_name runs fail."
+        exit 1
+      fi
+    done
  done
 done
-if [ $? -eq 0 ]; then
-  exit 0
-else
-  echo "inference demo runs fail."
-  exit 1
-fi
 set +x
--- a/paddle/contrib/inference/demo_ci/utils.h
+++ b/paddle/contrib/inference/demo_ci/utils.h
@ -16,7 +16,7 @@
 #include <string>
 #include <vector>

-#include "paddle/contrib/inference/paddle_inference_api.h"
+#include "contrib/inference/paddle_inference_api.h"

 namespace paddle {
 namespace demo {
--- a/paddle/contrib/inference/demo_ci/vis_demo.cc
+++ b/paddle/contrib/inference/demo_ci/vis_demo.cc
@ -18,19 +18,14 @@ limitations under the License. */

 #include <gflags/gflags.h>
 #include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
-#include <gtest/gtest.h>
 #include <fstream>
 #include <iostream>
-#include "paddle/contrib/inference/demo/utils.h"
-#include "paddle/contrib/inference/paddle_inference_api.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "utils.h"

 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
 #endif
-
-namespace paddle {
-namespace demo {
-
 DEFINE_string(modeldir, "", "Directory of the inference model.");
 DEFINE_string(refer, "", "path to reference result for comparison.");
 DEFINE_string(
@ -38,6 +33,10 @@ DEFINE_string(
    "",
    "path of data; each line is a record, format is "
    "'<space splitted floats as data>\t<space splitted ints as shape'");
+DEFINE_bool(use_gpu, false, "Whether use gpu.");
+
+namespace paddle {
+namespace demo {

 struct Record {
  std::vector<float> data;
@ -47,7 +46,7 @@ struct Record {
 void split(const std::string& str, char sep, std::vector<std::string>* pieces);

 Record ProcessALine(const std::string& line) {
-  LOG(INFO) << "process a line";
+  VLOG(3) << "process a line";
  std::vector<std::string> columns;
  split(line, '\t', &columns);
  CHECK_EQ(columns.size(), 2UL)
@ -65,8 +64,8 @@ Record ProcessALine(const std::string& line) {
  for (auto& s : shape_strs) {
    record.shape.push_back(std::stoi(s));
  }
-  LOG(INFO) << "data size " << record.data.size();
-  LOG(INFO) << "data shape size " << record.shape.size();
+  VLOG(3) << "data size " << record.data.size();
+  VLOG(3) << "data shape size " << record.shape.size();
  return record;
 }

@ -78,20 +77,22 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
  file.close();

  size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
-  LOG(INFO) << "predictor output numel " << numel;
-  LOG(INFO) << "reference output numel " << refer.data.size();
-  EXPECT_EQ(numel, refer.data.size());
+  VLOG(3) << "predictor output numel " << numel;
+  VLOG(3) << "reference output numel " << refer.data.size();
+  PADDLE_ENFORCE_EQ(numel, refer.data.size());
  switch (output.dtype) {
    case PaddleDType::INT64: {
      for (size_t i = 0; i < numel; ++i) {
-        EXPECT_EQ(static_cast<int64_t*>(output.data.data())[i], refer.data[i]);
+        PADDLE_ENFORCE_EQ(static_cast<int64_t*>(output.data.data())[i],
+                          refer.data[i]);
      }
      break;
    }
    case PaddleDType::FLOAT32:
      for (size_t i = 0; i < numel; ++i) {
-        EXPECT_NEAR(
-            static_cast<float*>(output.data.data())[i], refer.data[i], 1e-5);
+        PADDLE_ENFORCE_LT(
+            fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]),
+            1e-5);
      }
      break;
  }
@ -106,15 +107,15 @@ void Main(bool use_gpu) {
  config.prog_file = FLAGS_modeldir + "/__model__";
  config.use_gpu = use_gpu;
  config.device = 0;
-#ifdef PADDLE_WITH_CUDA
-  config.fraction_of_gpu_memory = FLAGS_fraction_of_gpu_memory_to_use;
-#endif
+  if (FLAGS_use_gpu) {
+    config.fraction_of_gpu_memory = 0.1;  // set by yourself
+  }

-  LOG(INFO) << "init predictor";
+  VLOG(3) << "init predictor";
  auto predictor =
      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);

-  LOG(INFO) << "begin to process data";
+  VLOG(3) << "begin to process data";
  // Just a single batch of data.
  std::string line;
  std::ifstream file(FLAGS_data);
@ -129,21 +130,26 @@ void Main(bool use_gpu) {
      .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)),
      .dtype = PaddleDType::FLOAT32};

-  LOG(INFO) << "run executor";
+  VLOG(3) << "run executor";
  std::vector<PaddleTensor> output;
  predictor->Run({input}, &output);

-  LOG(INFO) << "output.size " << output.size();
+  VLOG(3) << "output.size " << output.size();
  auto& tensor = output.front();
-  LOG(INFO) << "output: " << SummaryTensor(tensor);
+  VLOG(3) << "output: " << SummaryTensor(tensor);

  // compare with reference result
  CheckOutput(FLAGS_refer, tensor);
 }

-TEST(demo, vis_demo_cpu) { Main(false /*use_gpu*/); }
-#ifdef PADDLE_WITH_CUDA
-TEST(demo, vis_demo_gpu) { Main(true /*use_gpu*/); }
-#endif
 }  // namespace demo
 }  // namespace paddle
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  paddle::demo::Main(false /* use_gpu*/);
+  if (FLAGS_use_gpu) {
+    paddle::demo::Main(true /*use_gpu*/);
+  }
+  return 0;
+}
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
@ -54,6 +54,7 @@ bool PaddleInferenceAnakinPredictor::Run(
      LOG(ERROR) << "copy data from CPU to GPU error";
      return false;
    }
+    cudaStreamSynchronize(NULL);
  }

  executor_.prediction();
@ -76,6 +77,7 @@ bool PaddleInferenceAnakinPredictor::Run(
      LOG(ERROR) << "copy data from GPU to CPU error";
      return false;
    }
+    cudaStreamSynchronize(NULL);
  }
  return true;
 }
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@ -104,7 +104,7 @@ ParallelExecutor::ParallelExecutor(
  }

  if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
-    BCastParamsToDevs(bcast_vars);
+    BCastParamsToDevices(bcast_vars);
  }
  // Startup Program has been run. All local scopes has correct parameters.

@ -140,7 +140,7 @@ ParallelExecutor::ParallelExecutor(
      member_->places_, std::move(member_->executor_)));
 }

-void ParallelExecutor::BCastParamsToDevs(
+void ParallelExecutor::BCastParamsToDevices(
    const std::unordered_set<std::string> &vars) const {
  // the initializing bcast, all vars would be bcast from device(0),
  // otherwise
@ -218,7 +218,10 @@ void ParallelExecutor::BCastParamsToDevs(

        auto local_scope = member_->local_scopes_[i];
        auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
-        if (member_->use_all_reduce_ || member_->use_cuda_) {
+
+        // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix.
+        if (member_->use_all_reduce_ || member_->use_cuda_ ||
+            var == "@LR_DECAY_COUNTER@") {
          t->Resize(dims);
          t->mutable_data(cpu, main_tensor.type());
          paddle::framework::TensorCopy(main_tensor, cpu, t);
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@ -66,7 +66,7 @@ class ParallelExecutor {
  void Run(const std::vector<std::string> &fetch_tensors,
           const std::string &fetched_var_name);

-  void BCastParamsToDevs(const std::unordered_set<std::string> &vars) const;
+  void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;

 private:
  ParallelExecutorPrivate *member_;
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@ -1,3 +1,10 @@
+# analysis and tensorrt must be added before creating static library,
+# otherwise, there would be undefined reference to them in static library.
+add_subdirectory(analysis)
+if (TENSORRT_FOUND)
+  add_subdirectory(tensorrt)
+endif()
+
 set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor )

 # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
@ -7,10 +14,6 @@ cc_library(paddle_fluid_api

 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)

-if(WITH_CONTRIB)
-  set(fluid_modules "${fluid_modules}" paddle_inference_api)
-endif()
-
 # Create static library
 cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
 if(NOT APPLE)
@ -35,9 +38,3 @@ if(WITH_TESTING)
  # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
  add_subdirectory(tests/book)
 endif()
-
-add_subdirectory(analysis)
-
-if (TENSORRT_FOUND)
-  add_subdirectory(tensorrt)
-endif()
--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@ -35,7 +35,14 @@ class AucOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(inference_height, label_height,
                      "Out and Label should have same height.");

+    int num_thres = ctx->Attrs().Get<int>("num_thresholds");
+
    ctx->SetOutputDim("AUC", {1});
+    ctx->SetOutputDim("TPOut", {num_thres});
+    ctx->SetOutputDim("TNOut", {num_thres});
+    ctx->SetOutputDim("FPOut", {num_thres});
+    ctx->SetOutputDim("FNOut", {num_thres});
+
    ctx->ShareLoD("Out", /*->*/ "AUC");
  }

@ -63,10 +70,18 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Label",
             "A 2D int tensor indicating the label of the training data."
             "The height is batch size and width is always 1.");
+    AddInput("TP", "True-Positive value.");
+    AddInput("FP", "False-Positive value.");
+    AddInput("TN", "True-Negative value.");
+    AddInput("FN", "False-Negative value.");
    // TODO(typhoonzero): support weight input
    AddOutput("AUC",
              "A scalar representing the "
              "current area-under-the-curve.");
+    AddOutput("TPOut", "True-Positive value.");
+    AddOutput("FPOut", "False-Positive value.");
+    AddOutput("TNOut", "True-Negative value.");
+    AddOutput("FNOut", "False-Negative value.");

    AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
        .SetDefault("ROC");
--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@ -34,6 +34,12 @@ class AucKernel : public framework::OpKernel<T> {
    auto* inference = ctx.Input<Tensor>("Out");
    auto* label = ctx.Input<Tensor>("Label");
    auto* auc = ctx.Output<Tensor>("AUC");
+    // Only use output var for now, make sure it's persistable and
+    // not cleaned up for each batch.
+    auto* true_positive = ctx.Output<Tensor>("TPOut");
+    auto* false_positive = ctx.Output<Tensor>("FPOut");
+    auto* true_negative = ctx.Output<Tensor>("TNOut");
+    auto* false_negative = ctx.Output<Tensor>("FNOut");

    float* auc_data = auc->mutable_data<float>(ctx.GetPlace());

@ -54,19 +60,10 @@ class AucKernel : public framework::OpKernel<T> {
    const T* inference_data = inference->data<T>();
    const int64_t* label_data = label->data<int64_t>();

-    // Create local tensor for storing the curve: TP, FN, TN, FP
-    // TODO(typhoonzero): use eigen op to caculate these values.
-    Tensor true_positive, false_positive, true_negative, false_negative;
-
-    true_positive.Resize({num_thresholds});
-    false_negative.Resize({num_thresholds});
-    true_negative.Resize({num_thresholds});
-    false_positive.Resize({num_thresholds});
-
-    int64_t* tp_data = true_positive.mutable_data<int64_t>(ctx.GetPlace());
-    int64_t* fn_data = false_negative.mutable_data<int64_t>(ctx.GetPlace());
-    int64_t* tn_data = true_negative.mutable_data<int64_t>(ctx.GetPlace());
-    int64_t* fp_data = false_positive.mutable_data<int64_t>(ctx.GetPlace());
+    auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
+    auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
+    auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace());
+    auto* fp_data = false_positive->mutable_data<int64_t>(ctx.GetPlace());

    for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) {
      // caculate TP, FN, TN, FP for current thresh
@ -91,10 +88,10 @@ class AucKernel : public framework::OpKernel<T> {
        }
      }
      // store rates
-      tp_data[idx_thresh] = tp;
-      fn_data[idx_thresh] = fn;
-      tn_data[idx_thresh] = tn;
-      fp_data[idx_thresh] = fp;
+      tp_data[idx_thresh] += tp;
+      fn_data[idx_thresh] += fn;
+      tn_data[idx_thresh] += tn;
+      fp_data[idx_thresh] += fp;
    }
    // epsilon to avoid divide by zero.
    float epsilon = 1e-6;
--- a/paddle/fluid/operators/checkpoint_notify_op.cc
+++ b/paddle/fluid/operators/checkpoint_notify_op.cc
@ -48,7 +48,7 @@ class CheckpointNotifyOp : public framework::OperatorBase {
      VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name
              << " and dir:" << dir << " to " << epmap[i];
    }
-    rpc_client->Wait();
+    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
  }
 };

--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@ -281,9 +281,10 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
  req_count_++;
 }

-void GRPCClient::Wait() {
+bool GRPCClient::Wait() {
  std::unique_lock<std::mutex> lk(sync_mutex_);
-  sync_cond_.wait(lk, [this] { return req_count_ == 0; });
+  sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); });
+  return ok_;
 }

 void GRPCClient::Proceed() {
@ -297,6 +298,14 @@ void GRPCClient::Proceed() {
    if (c->status_.ok()) {
      VLOG(3) << c->var_h_.String() << " process";
      c->Process();
+    } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
+      LOG(ERROR) << c->var_h_.String()
+                 << " meets grpc error:" << c->status_.error_message();
+      {
+        std::lock_guard<std::mutex> lk(sync_mutex_);
+        ok_ = false;
+      }
+      sync_cond_.notify_all();
    } else {
      LOG(FATAL) << c->var_h_.String()
                 << " meets grpc error:" << c->status_.error_message();
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@ -188,7 +188,7 @@ class CheckpointNotifyProcessor : public BaseProcessor {

 class GRPCClient : public RPCClient {
 public:
-  GRPCClient() {}
+  GRPCClient() : ok_(true) {}
  virtual ~GRPCClient();

  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
@ -221,7 +221,7 @@ class GRPCClient : public RPCClient {
  void AsyncSendEndPass(const std::string& ep,
                        int64_t time_out = FLAGS_rpc_deadline) override;

-  void Wait() override;
+  bool Wait() override;

  void SendBeginPass() override;

@ -247,6 +247,7 @@ class GRPCClient : public RPCClient {
  std::mutex sync_mutex_;
  std::condition_variable sync_cond_;
  std::atomic<int64_t> req_count_{0};
+  bool ok_;

  // mutex for GetChannel thread safety
  std::mutex chan_mutex_;
--- a/Show More
+++ b/Show More