Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/clean_blas

Yu Yang 7 years ago
commit a6edeb39b3

@@ -32,7 +32,7 @@ RUN apt-get update && \
     automake locales clang-format swig doxygen cmake  \
     liblapack-dev liblapacke-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
-    net-tools libtool && \
+    net-tools libtool ccache && \
     apt-get clean -y
# Install Go and glide

@@ -0,0 +1,21 @@
+#!/bin/bash
+# Update VGG_SRC to point to the source file.
+VGG_SRC="vgg16_fluid.py"
+export TRAINING_ROLE=PSERVER
+export TRAINERS=2
+export POD_IP=127.0.0.1
+export PADDLE_INIT_PORT=6174
+MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
+# The trainers need to wait for the pserver to start first.
+sleep 10
+echo "done start ps"
+export TRAINING_ROLE=TRAINER
+export TRAINERS=2
+export POD_IP=127.0.0.1
+export PADDLE_INIT_PORT=6174
+CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
+CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &

@@ -200,18 +200,19 @@ def main():
                 num_samples += len(data)
                 train_pass_acc.add(value=acc, weight=b_size)
                 print(
-                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
-                    % (pass_id, iters, loss, acc,
-                       len(data) / (time.time() - ts))
+                    "Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
+                    "Speed = %.2f img/s" % (args.task_index, pass_id, iters,
+                                            loss, acc,
+                                            len(data) / (time.time() - ts))
                 )  # The accuracy is the accumulation of batches, but not the current batch.
             pass_elapsed = time.time() - start_time
             pass_train_acc = train_pass_acc.eval()
             pass_test_acc = test(exe)
-            print(
-                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
-                % (pass_id, num_samples / pass_elapsed, pass_train_acc,
-                   pass_test_acc))
+            print("Task:%d Pass = %d, Training performance = %f imgs/s, "
+                  "Train accuracy = %f, Test accuracy = %f\n" %
+                  (args.task_index, pass_id, num_samples / pass_elapsed,
+                   pass_train_acc, pass_test_acc))
if args.local:
# Parameter initialization
@@ -239,8 +240,6 @@ def main():
         t = fluid.DistributeTranspiler()
         t.transpile(
-            optimize_ops,
-            params_grads,
             trainer_id=args.task_index,
             pservers=args.ps_hosts,
             trainers=trainers)

@@ -108,7 +108,7 @@ paddle_error paddle_matrix_get_row(paddle_matrix mat,
 paddle_error paddle_matrix_get_shape(paddle_matrix mat,
                                      uint64_t* height,
                                      uint64_t* width) {
-  if (mat == nullptr) return kPD_NULLPTR;
+  if (mat == nullptr || cast(mat)->mat == nullptr) return kPD_NULLPTR;
if (height != nullptr) {
*height = cast(mat)->mat->getHeight();
}

@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifndef HL_BASE_H_
-#define HL_BASE_H_
+#pragma once
 #include <cstddef>
@@ -207,8 +206,8 @@ typedef struct {
 #ifdef __NVCC__
-#include "cuda_runtime.h"
-#include "hl_cuda.h"
+#include <cuda_runtime.h>
+#include "paddle/cuda/include/hl_cuda.h"
 #include "paddle/utils/Logging.h"
 extern __thread bool g_sync_flag;
@@ -228,6 +227,19 @@ extern __thread cudaStream_t default_stream;
       << "CUDA error: " << hl_get_device_error_string((size_t)err); \
   }
-#endif /* __NVCC__ */
+// __shfl has been deprecated as of CUDA 9.0.
+#if CUDA_VERSION < 9000
+template <typename T>
+__forceinline__ __device__ T
+__shfl_sync(unsigned, T val, int src_line, int width) {
+  return __shfl(val, src_line, width);
+}
-#endif /* HL_BASE_H_ */
+#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
+#else
+#define FULL_WARP_MASK 0xFFFFFFFF
+#define CREATE_SHFL_MASK(mask, predicate) \
+  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+#endif
+#endif  // __NVCC__
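
For context, CREATE_SHFL_MASK is what callers pair with __shfl_sync so that one source builds on both sides of the CUDA 9.0 boundary. A minimal sketch of the intended usage (illustrative only; the kernel name and launch shape are not part of this commit):

// Broadcast lane 0's value across each full warp of 32 threads.
__global__ void warp_broadcast(float *data, int n) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned mask = 0u;
  CREATE_SHFL_MASK(mask, tid < n);  // ballot of active lanes on CUDA >= 9.0, 0u before
  if (tid < n) {
    // Resolves to the real __shfl_sync on CUDA >= 9.0 and to the
    // __shfl-based shim above on older toolkits.
    data[tid] = __shfl_sync(mask, data[tid], 0, 32);
  }
}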

@@ -341,12 +341,15 @@ void hl_lstm_parallel_forward(real *gateValue,
 }
 __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
-  int addr = idx % 32;
+  const int warp_size = 32;
+  int addr = idx % warp_size;
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, addr < warp_size);
 #pragma unroll
   for (int k = 1; k < 32; k++) {
     // rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32);
-    addr = __shfl_sync(addr, (idx + 1) % 32, 32);
-    a[k] = __shfl_sync(a[k], addr, 32);
+    addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32);
+    a[k] = __shfl_sync(mask, a[k], addr, 32);
   }
 #pragma unroll
@@ -360,10 +363,11 @@ __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
   }
   addr = (32 - idx) % 32;
+  CREATE_SHFL_MASK(mask, idx % 32 < warp_size);
 #pragma unroll
   for (int k = 0; k < 32; k++) {
-    a[k] = __shfl_sync(a[k], addr, 32);
-    addr = __shfl_sync(addr, (idx + 31) % 32, 32);
+    a[k] = __shfl_sync(mask, a[k], addr, 32);
+    addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32);
   }
 }

@@ -244,13 +244,16 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK,
     if (--beamSize == 0) break;
     __syncthreads();
+    unsigned mask = 0u;
+    // CREATE_SHFL_MASK(mask, tid < len);
     if (tid == maxId[0]) {
       if (beam < maxLength) {
         shTopK[tid] = topK[beam];
       }
     }
     if (maxId[0] / 32 == warp) {
-      if (__shfl_sync(beam, (maxId[0]) % 32, 32) == maxLength) break;
+      if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break;
     }
   }
 }

@@ -34,7 +34,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool skip_scale_loss,
+    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale,
     platform::NCCLContextMap *nccl_ctxs)
     : loss_var_name_(loss_var_name),
       places_(places),
@@ -45,7 +45,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool skip_scale_loss)
+    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale)
     : loss_var_name_(loss_var_name),
       places_(places),
       local_scopes_(local_scopes) {
@@ -53,7 +53,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
   for (auto &p : params) {
     grad_names_.insert(GradVarName(p));
   }
-  skip_scale_loss_ = skip_scale_loss;
+  use_default_grad_scale_ = use_default_grad_scale;
 }
 void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
@@ -126,8 +126,8 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     } else if (IsDistTrainOp(*op, send_op)) {
       CreateComputationalOps(&result, *op, 1);
     } else if (IsScaleLossOp(*op)) {
-      // user can customize loss@grad if skip_scale_loss_
-      if (!skip_scale_loss_) {
+      // user can customize loss@grad if not use_default_grad_scale_
+      if (use_default_grad_scale_) {
         CreateScaleLossGradOp(&result);
       }
       is_forwarding = false;

@@ -41,7 +41,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                          const std::string &loss_var_name,
                          const std::unordered_set<std::string> &params,
                          const std::vector<Scope *> &local_scopes,
-                         bool skip_scale_loss);
+                         bool use_default_grad_scale);
 #endif
   std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
@@ -59,7 +59,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #ifdef PADDLE_WITH_CUDA
   platform::NCCLContextMap *nccl_ctxs_;
 #endif
-  bool skip_scale_loss_;
+  bool use_default_grad_scale_;
   bool IsScaleLossOp(const OpDesc &op) const;

@@ -255,11 +255,11 @@ TEST(LoDTensor, RecordIO) {
   std::unique_ptr<std::istream> stream_ptr(stream);
   recordio::Scanner scanner(std::move(stream_ptr));
   auto tensors = ReadFromRecordIO(&scanner, ctx);
-  ASSERT_EQ(tensors.size(), 2);
+  ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
   assert_tensor_ok(tensors[0]);
   assert_tensor_ok(tensors[1]);
   tensors = ReadFromRecordIO(&scanner, ctx);
-  ASSERT_EQ(tensors.size(), 2);
+  ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
   assert_tensor_ok(tensors[0]);
   assert_tensor_ok(tensors[1]);
 }

@@ -93,6 +93,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
   RunImpl(scope, place);
 }
+bool OperatorBase::HasInputs(const std::string& name) const {
+  if (inputs_.find(name) != inputs_.end()) {
+    return true;
+  } else {
+    return false;
+  }
+}
 std::string OperatorBase::Input(const std::string& name) const {
   auto& ins = Inputs(name);
   PADDLE_ENFORCE_LE(ins.size(), 1UL,
@@ -109,6 +117,14 @@ const std::vector<std::string>& OperatorBase::Inputs(
   return it->second;
 }
+bool OperatorBase::HasOutputs(const std::string& name) const {
+  if (outputs_.find(name) != outputs_.end()) {
+    return true;
+  } else {
+    return false;
+  }
+}
 std::string OperatorBase::Output(const std::string& name) const {
   auto& outs = Outputs(name);
   PADDLE_ENFORCE_LE(outs.size(), 1UL,
@@ -220,13 +236,18 @@ void OperatorBase::CheckAllInputOutputSet() const {
   if (op_info == nullptr || op_info->proto_ == nullptr) return;
   for (auto& in : op_info->Proto().inputs()) {
-    PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
-                   "Type %s's input %s is not set", Type(), in.name());
+    if (!in.dispensable()) {
+      PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
+                     "Operator %s's input, %s, is not set", Type(), in.name());
+    }
   }
   for (auto& out : op_info->Proto().outputs()) {
-    PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
-                   "Type %s's output %s is not set", Type(), out.name());
+    if (!out.dispensable()) {
+      PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
+                     "Operator %s's output, %s, is not set", Type(),
+                     out.name());
+    }
   }
 }
@@ -332,6 +353,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
       : op_(op), scope_(scope) {}
   bool HasInput(const std::string& name) const override {
+    if (!op_.HasInputs(name)) {
+      return false;
+    }
     auto& ins = Inputs(name);
     size_t length = ins.size();
     if (length == 0) {
@@ -345,6 +369,9 @@
   }
   bool HasOutput(const std::string& name) const override {
+    if (!op_.HasOutputs(name)) {
+      return false;
+    }
     auto& outs = Outputs(name);
     size_t length = outs.size();
     if (length == 0) {
@@ -358,6 +385,9 @@
   }
   bool HasInputs(const std::string& name) const override {
+    if (!op_.HasInputs(name)) {
+      return false;
+    }
     auto inputs = op_.Inputs(name);
     if (inputs.empty()) {
       return false;
@@ -371,6 +401,9 @@
   }
   bool HasOutputs(const std::string& name) const override {
+    if (!op_.HasOutputs(name)) {
+      return false;
+    }
     auto outputs = op_.Outputs(name);
     if (outputs.empty()) {
       return false;
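
With these guards, HasInput/HasOutput now return false for an argument that was never set, instead of throwing, which is what makes dispensable (optional) arguments usable from InferShape. A sketch of the calling pattern (hypothetical op, not from this commit; "Bias" is assumed to be declared dispensable in the proto):

// Hypothetical InferShape fragment probing an optional input.
void InferShape(framework::InferShapeContext* ctx) const override {
  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must be set");
  if (ctx->HasInput("Bias")) {  // returns false instead of throwing when unset
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1,
                      "Bias must be a vector");
  }
  ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
}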

@@ -105,6 +105,7 @@ class OperatorBase {
   const VariableNameMap& Inputs() const { return inputs_; }
   const VariableNameMap& Outputs() const { return outputs_; }
+  bool HasInputs(const std::string& name) const;
   //! Get an input with the argument's name described in `op_proto`
   std::string Input(const std::string& name) const;
   //! Get an input which has multiple variables.
@@ -112,6 +113,7 @@
   //! Get all input variable names
   std::vector<std::string> InputVars() const;
+  bool HasOutputs(const std::string& name) const;
   //! Get an output with the argument's name described in `op_proto`
   std::string Output(const std::string& name) const;
   //! Get an output which has multiple variables.

@@ -58,7 +58,7 @@ ParallelExecutor::ParallelExecutor(
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
     Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay,
-    bool customize_scale_loss)
+    bool use_default_grad_scale)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
@@ -93,11 +93,11 @@
 #ifdef PADDLE_WITH_CUDA
   details::MultiDevSSAGraphBuilder builder(
       member_->places_, loss_var_name, params, member_->local_scopes_,
-      customize_scale_loss, member_->nccl_ctxs_.get());
+      use_default_grad_scale, member_->nccl_ctxs_.get());
 #else
   details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
                                            params, member_->local_scopes_,
-                                           customize_scale_loss);
+                                           use_default_grad_scale);
 #endif
   auto graph = builder.Build(main_program);

@@ -40,7 +40,7 @@ class ParallelExecutor {
                    const ProgramDesc& main_program,
                    const std::string& loss_var_name, Scope* scope,
                    const std::vector<Scope*>& local_scopes,
-                   bool allow_op_delay, bool customize_scale_loss);
+                   bool allow_op_delay, bool use_default_grad_scale);
~ParallelExecutor();

@@ -120,11 +120,11 @@ bool SelectedRows::HasKey(int64_t key) const {
              : true;
 }
-std::vector<int64_t> SelectedRows::Get(std::vector<int64_t> keys,
-                                       framework::Tensor* value) const {
+std::vector<std::pair<int64_t, int64_t>> SelectedRows::Get(
+    std::vector<int64_t> keys, framework::Tensor* value) const {
   PADDLE_ENFORCE(value->IsInitialized(),
                  "The value tensor should be initialized.");
-  std::vector<int64_t> non_keys;
+  std::vector<std::pair<int64_t, int64_t>> non_keys_pair;
   int64_t value_width = value_->numel() / value_->dims()[0];
   PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
                     "output tensor should have the same shape with table "
@@ -133,7 +133,7 @@ std::vector<int64_t> SelectedRows::Get(std::vector<int64_t> keys,
   for (size_t i = 0; i < keys.size(); ++i) {
     int64_t index = Index(keys[i]);
     if (index == -1) {
-      non_keys.push_back(keys[i]);
+      non_keys_pair.push_back(std::make_pair(keys[i], static_cast<int64_t>(i)));
     } else {
       framework::VisitDataType(
           framework::ToDataType(value_->type()),
@@ -141,7 +141,7 @@ std::vector<int64_t> SelectedRows::Get(std::vector<int64_t> keys,
           index * value_width, value_width));
     }
   }
-  return non_keys;
+  return non_keys_pair;
}
bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {

@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -78,10 +79,11 @@ class SelectedRows {
   /*
    * @brief Get values by the key list.
    *
-   * @return a list of keys which do not exist in the table
+   * @return a list of pairs, each holding a missing key and its row index in
+   * the value tensor
    */
-  std::vector<int64_t> Get(std::vector<int64_t> keys,
-                           framework::Tensor* tensor) const;
+  std::vector<std::pair<int64_t, int64_t>> Get(std::vector<int64_t> keys,
+                                               framework::Tensor* value) const;
/*
* @brief Set a key-value pair into the table.
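
The index carried in each returned pair tells the caller which rows of the output tensor were left unfilled. A sketch of how a caller might consume the result, as exercised by the test below (illustrative only; `table`, `ids`, and `width` are assumed to be set up beforehand and are not part of this commit):

// Illustrative: look up ids, then handle the rows Get() could not fill.
platform::CPUPlace cpu;
framework::Tensor value;
value.mutable_data<float>(
    framework::make_ddim({static_cast<int64_t>(ids.size()), width}), cpu);
auto missing = table.Get(ids, &value);
for (auto& kv : missing) {
  // kv.first is the absent key and kv.second its row index in `value`,
  // so e.g. a distributed lookup can fetch kv.first remotely and write
  // the result into row kv.second.
}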

@@ -59,7 +59,7 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
   ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
 }
-TEST_F(SelectedRowsTester, Table) {
+TEST_F(SelectedRowsTester, SparseTable) {
   platform::CPUPlace cpu;
   SelectedRows table;
   // initialize a sparse table
@@ -87,11 +87,11 @@ TEST_F(SelectedRowsTester, Table) {
   framework::Tensor get_value;
   get_value.mutable_data<float>(framework::make_ddim({2, 100}), cpu);
   std::vector<int64_t> keys({non_key, key});
-  auto non_keys = table.Get(keys, &get_value);
+  auto non_key_pairs = table.Get(keys, &get_value);
   ASSERT_EQ(get_value.data<float>()[100], static_cast<float>(10));
-  ASSERT_EQ(non_keys.size(), static_cast<size_t>(1));
-  ASSERT_EQ(non_keys[0], non_key);
+  ASSERT_EQ(non_key_pairs.size(), static_cast<size_t>(1));
+  ASSERT_EQ(non_key_pairs[0].first, non_key);
 }
} // namespace framework

@@ -65,7 +65,7 @@ class TensorRTEngine : public EngineBase {
   // Initialize the inference network, so that TensorRT layers can add to this
   // network.
   void InitNetwork() {
-    infer_builder_.reset(createInferBuilder(logger_));
+    infer_builder_.reset(createInferBuilder(&logger_));
     infer_network_.reset(infer_builder_->createNetwork());
   }
   // After finishing adding ops, freeze this network and create the execution

@@ -46,13 +46,13 @@ const int kDataTypeSize[] = {
 // The following two APIs are implemented in TensorRT's header file and cannot
 // be loaded from the dynamic library, so we provide our own implementations
 // that call the corresponding _INTERNAL entry points in the dynamic library.
-static nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) {
+static nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IBuilder*>(
-      dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION));
+      dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION));
 }
-static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) {
+static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IRuntime*>(
-      dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION));
+      dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
 }
 // A logger for creating the TensorRT infer builder.
@@ -80,7 +80,7 @@ class NaiveLogger : public nvinfer1::ILogger {
     return *x;
   }
-  virtual ~NaiveLogger() override {}
+  ~NaiveLogger() override {}
 };
 }  // namespace tensorrt
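
A minimal sketch of calling the pointer-taking wrappers above (illustrative only; StderrLogger is a stand-in for the file's NaiveLogger, and <iostream> is assumed to be included):

// Illustrative: any nvinfer1::ILogger implementation can be passed by pointer.
class StderrLogger : public nvinfer1::ILogger {
  void log(Severity severity, const char* msg) override {
    std::cerr << msg << std::endl;  // print every message regardless of severity
  }
};

StderrLogger logger;
nvinfer1::IBuilder* builder = createInferBuilder(&logger);
// ... add layers, build an engine ...
builder->destroy();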

@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <cuda.h>
+#include <cuda_runtime_api.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include "NvInfer.h"
-#include "cuda.h"
-#include "cuda_runtime_api.h"
 #include "paddle/fluid/platform/dynload/tensorrt.h"
 namespace dy = paddle::platform::dynload;
@@ -43,7 +43,7 @@ class Logger : public nvinfer1::ILogger {
 class ScopedWeights {
  public:
-  ScopedWeights(float value) : value_(value) {
+  explicit ScopedWeights(float value) : value_(value) {
     w.type = nvinfer1::DataType::kFLOAT;
     w.values = &value_;
     w.count = 1;
@@ -58,13 +58,13 @@ class ScopedWeights {
 // The following two APIs are implemented in TensorRT's header file and cannot
 // be loaded from the dynamic library, so we provide our own implementations
 // that call the corresponding _INTERNAL entry points in the dynamic library.
-nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) {
+nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IBuilder*>(
-      dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION));
+      dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION));
 }
-nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) {
+nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IRuntime*>(
-      dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION));
+      dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
 }
const char* kInputTensor = "input";
@@ -74,7 +74,7 @@ const char* kOutputTensor = "output";
 nvinfer1::IHostMemory* CreateNetwork() {
   Logger logger;
   // Create the engine.
-  nvinfer1::IBuilder* builder = createInferBuilder(logger);
+  nvinfer1::IBuilder* builder = createInferBuilder(&logger);
   ScopedWeights weights(2.);
   ScopedWeights bias(3.);
@@ -103,9 +103,9 @@ nvinfer1::IHostMemory* CreateNetwork() {
   return model;
 }
-void Execute(nvinfer1::IExecutionContext& context, const float* input,
+void Execute(nvinfer1::IExecutionContext* context, const float* input,
              float* output) {
-  const nvinfer1::ICudaEngine& engine = context.getEngine();
+  const nvinfer1::ICudaEngine& engine = context->getEngine();
   // Two bindings, input and output
   ASSERT_EQ(engine.getNbBindings(), 2);
   const int input_index = engine.getBindingIndex(kInputTensor);
@@ -119,7 +119,7 @@ void Execute(nvinfer1::IExecutionContext& context, const float* input,
   // Copy the input to the GPU, execute the network, and copy the output back.
   ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float),
                                cudaMemcpyHostToDevice, stream));
-  context.enqueue(1, buffers, stream, nullptr);
+  context->enqueue(1, buffers, stream, nullptr);
   ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float),
                                cudaMemcpyDeviceToHost, stream));
   cudaStreamSynchronize(stream);
@@ -136,7 +136,7 @@ TEST(TensorrtTest, BasicFunction) {
   // Use the model to create an engine and an execution context.
   Logger logger;
-  nvinfer1::IRuntime* runtime = createInferRuntime(logger);
+  nvinfer1::IRuntime* runtime = createInferRuntime(&logger);
   nvinfer1::ICudaEngine* engine =
       runtime->deserializeCudaEngine(model->data(), model->size(), nullptr);
   model->destroy();
@@ -145,7 +145,7 @@ TEST(TensorrtTest, BasicFunction) {
   // Execute the network.
   float input = 1234;
   float output;
-  Execute(*context, &input, &output);
+  Execute(context, &input, &output);
   EXPECT_EQ(output, input * 2 + 3);
   // Destroy the engine.

(File diff suppressed because it is too large.)

@@ -15,6 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/batch_norm_op.h"
 #include <string>
 #include "paddle/fluid/framework/data_layout.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
namespace paddle {
namespace operators {
@@ -87,9 +90,13 @@
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =
         framework::ToDataType(ctx.Input<Tensor>("X")->type());
-    // For float or float16 input tensor, the type of the scale, bias, mean,
-    // and var tensors should both be float.
+    // By default the scale, bias, mean, and var tensors should be float32
+    // for float or float16 input tensors, and float64 for double input
+    // tensors.
     auto bn_param_type = framework::proto::VarType::FP32;
+    if (input_data_type == framework::proto::VarType::FP64) {
+      bn_param_type = framework::proto::VarType::FP64;
+    }
     PADDLE_ENFORCE_EQ(bn_param_type,
                       framework::ToDataType(ctx.Input<Tensor>("Scale")->type()),
                       "Scale input should be of float type");
@@ -102,7 +109,18 @@ class BatchNormOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType(
                                          ctx.Input<Tensor>("Variance")->type()),
                       "Variance input should be of float type");
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+    }
+#endif
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                   library_);
}
};
@@ -147,6 +165,9 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
               "Variance of the current mini batch, "
               "will apply to output when training")
         .AsIntermediate();
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
AddComment(R"DOC(
Batch Normalization.
@@ -345,8 +366,19 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     if (t == nullptr) {
       PADDLE_THROW("can't find Y@GRAD");
     }
-    return framework::OpKernelType(framework::ToDataType(t->type()),
-                                   ctx.GetPlace());
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+    }
+#endif
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+        layout, library_);
}
};
@@ -470,6 +502,7 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
     op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
     op->SetInput("Scale", Input("Scale"));
+    op->SetInput("Bias", Input("Bias"));
op->SetInput("SavedMean", Output("SavedMean"));
op->SetInput("SavedVariance", Output("SavedVariance"));
@@ -492,8 +525,9 @@ REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
 REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
 REGISTER_OP_CPU_KERNEL(
-    batch_norm,
-    ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>);
+    batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BatchNormKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     batch_norm_grad,
-    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, double>);

@@ -287,6 +287,8 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
+    ops::BatchNormKernel<plat::CUDADeviceContext, double>,
     ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
-    batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>);
+    batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
+    ops::BatchNormGradKernel<plat::CUDADeviceContext, double>);

@@ -164,11 +164,13 @@ or not. But the output only shares the LoD information with input X.
 }  // namespace paddle
 namespace ops = paddle::operators;
+using CPUCtx = paddle::platform::CPUDeviceContext;
 REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
-REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<float>,
-                       ops::CrossEntropyOpKernel<double>);
+REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
+                       ops::CrossEntropyOpKernel<CPUCtx, double>);
 REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
-                       ops::CrossEntropyGradientOpKernel<float>,
-                       ops::CrossEntropyGradientOpKernel<double>);
+                       ops::CrossEntropyGradientOpKernel<CPUCtx, float>,
+                       ops::CrossEntropyGradientOpKernel<CPUCtx, double>);

@@ -14,98 +14,11 @@ limitations under the License. */
 #include "paddle/fluid/operators/cross_entropy_op.h"
 namespace paddle {
 namespace operators {
-namespace {
-template <typename T>
-__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
-                                           const int64_t* label, const int N,
-                                           const int D) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    int idx = i * D + label[i];
-    dX[idx] = -dY[i] / X[idx];
-  }
-}
-template <typename T>
-__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
-                                               const T* label, const int N,
-                                               const int D) {
-  int ids = blockIdx.x * blockDim.x + threadIdx.x;
-  if (ids < N * D) {
-    int row_ids = ids / D;
-    dX[ids] = -label[ids] * dY[row_ids] / X[ids];
-  }
-}
-}  // namespace
-template <typename T>
-class CrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    const Tensor* x = ctx.Input<Tensor>("X");
-    const Tensor* label = ctx.Input<Tensor>("Label");
-    Tensor* y = ctx.Output<Tensor>("Y");
-    y->mutable_data<T>(ctx.GetPlace());
-    math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
-        ctx.template device_context<platform::CUDADeviceContext>(), y, x, label,
-        ctx.Attr<bool>("soft_label"));
-  }
-};
-template <typename T>
-class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    const Tensor* x = ctx.Input<Tensor>("X");
-    const Tensor* label = ctx.Input<Tensor>("Label");
-    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    dx->mutable_data<T>(ctx.GetPlace());
-    const T* dy_data =
-        ctx.Input<Tensor>(framework::GradVarName("Y"))->data<T>();
-    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    const T* x_data = x->data<T>();
-    int64_t batch_size = x->dims()[0];
-    int64_t class_num = x->dims()[1];
-    int block = 512;
-    int grid = (batch_size * class_num + block - 1) / block;
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto stream = dev_ctx.stream();
-    if (ctx.Attr<bool>("soft_label")) {
-      auto* label_data = label->data<T>();
-      SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
-          dx_data, dy_data, x_data, label_data, batch_size, class_num);
-    } else {
-      math::SetConstant<platform::CUDADeviceContext, T> functor;
-      functor(dev_ctx, dx, 0);
-      auto* label_data = label->data<int64_t>();
-      grid = (batch_size + block - 1) / block;
-      CrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
-          dx_data, dy_data, x_data, label_data, batch_size, class_num);
-    }
-  }
-};
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>,
-                        ops::CrossEntropyOpCUDAKernel<double>);
+using CUDACtx = paddle::platform::CUDADeviceContext;
+REGISTER_OP_CUDA_KERNEL(cross_entropy,
+                        ops::CrossEntropyOpKernel<CUDACtx, float>,
+                        ops::CrossEntropyOpKernel<CUDACtx, double>);
 REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
-                        ops::CrossEntropyGradientOpCUDAKernel<float>,
-                        ops::CrossEntropyGradientOpCUDAKernel<double>);
+                        ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
+                        ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
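
The GPU-only kernels above can be deleted because cross_entropy_op.h now provides kernels templated on the device context, so a single definition serves both the CPU and CUDA registrations. Roughly, the pattern looks like this (a sketch under that assumption, modeled on the deleted CUDA kernel's body, not the actual header contents):

// Sketch: one kernel body parameterized by DeviceContext.
template <typename DeviceContext, typename T>
class CrossEntropyOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const Tensor* x = ctx.Input<Tensor>("X");
    const Tensor* label = ctx.Input<Tensor>("Label");
    Tensor* y = ctx.Output<Tensor>("Y");
    y->mutable_data<T>(ctx.GetPlace());
    // The functor has CPU and CUDA specializations, so the same kernel
    // class can be registered for both CPUCtx and CUDACtx.
    math::CrossEntropyFunctor<DeviceContext, T>()(
        ctx.template device_context<DeviceContext>(), y, x, label,
        ctx.Attr<bool>("soft_label"));
  }
};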

(Some files were not shown because too many files have changed in this diff.)
