@@ -62,7 +62,7 @@ static DDim GetDims(const Scope& scope, const std::string& name,
 
   if (var->IsType<LoDTensor>()) {
     const LoDTensor& tensor = var->Get<LoDTensor>();
-    if (!tensor.IsInitialized()) {
+    if (UNLIKELY(!tensor.IsInitialized())) {
       return DDim({-1});
     }
     return tensor.dims();
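The `UNLIKELY` wrapper introduced here (and in the GetDtype/GetLoD hunks below) is a branch-prediction hint: an uninitialized tensor is the rare error path. As a minimal sketch of the conventional definition, assuming a GCC/Clang-style toolchain (Paddle defines its own macro in `platform/enforce.h`, which may differ in detail):

    // Illustrative definition only: mark `condition` as expected-false so the
    // compiler keeps the initialized-tensor path on the fall-through branch.
    #if defined(__GNUC__) || defined(__clang__)
    #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
    #else
    #define UNLIKELY(condition) (condition)
    #endif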
@@ -91,13 +91,13 @@ static std::string GetDtype(const Scope& scope, const std::string& name) {
 
   if (var->IsType<LoDTensor>()) {
     const LoDTensor& tensor = var->Get<LoDTensor>();
-    if (!tensor.IsInitialized()) {
+    if (UNLIKELY(!tensor.IsInitialized())) {
       return "";
     }
     return DataTypeToString(ToDataType(tensor.type()));
   } else if (var->IsType<SelectedRows>()) {
     auto tensor = var->Get<SelectedRows>().value();
-    if (!tensor.IsInitialized()) {
+    if (UNLIKELY(!tensor.IsInitialized())) {
       return "uninited";
     } else {
       return DataTypeToString(ToDataType(tensor.type()));
@@ -130,7 +130,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 
   if (var->IsType<LoDTensor>()) {
     const LoDTensor& tensor = var->Get<LoDTensor>();
-    if (!tensor.IsInitialized()) {
+    if (UNLIKELY(!tensor.IsInitialized())) {
       return default_lod;
     }
     return tensor.lod();
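For context on the `default_lod` fallback: a LoD ("level of detail") is a nested offset table describing the variable-length sequences packed into a batch. Roughly, simplified from `lod_tensor.h` (the real alias uses Paddle's own Vector type):

    // e.g. {{0, 2, 5}}: two sequences, rows [0, 2) and [2, 5), in a 5-row tensor.
    using LoD = std::vector<std::vector<size_t>>;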
@@ -149,11 +149,13 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     platform::SetDeviceId(dev_id);
 #endif
   }
-  VLOG(3) << "start pool";
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   platform::RecordEvent record_event(Type(), pool.Get(place));
-  VLOG(3) << "start RunImpl";
+
+  // The profiler has a process-wide mutex, which causes a serious performance
+  // issue in concurrency scenarios. Here an `if` is used to fix the issue.
+  // Please do not remove the `if`; ask @Superjomn if there is any concern.
   RunImpl(scope, place);
 
   VLOG(3) << place << " " << DebugStringEx(&scope);
 }
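Constructing a `platform::RecordEvent` takes the profiler's process-wide mutex, which is what the added comment warns about; the `if` it mentions lies outside this hunk. A sketch of the guard pattern it describes, assuming a predicate like `platform::IsProfileEnabled()` (not shown in this diff):

    // Only pay for the profiler lock when profiling is actually enabled.
    if (platform::IsProfileEnabled()) {
      platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
      platform::RecordEvent record_event(Type(), pool.Get(place));
      RunImpl(scope, place);
    } else {
      RunImpl(scope, place);
    }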
@@ -206,7 +208,6 @@ const std::vector<std::string>& OperatorBase::Outputs(
 }
 
 std::string OperatorBase::DebugStringEx(const Scope* scope) const {
-  VLOG(3) << this->Type() << " scope ptr " << scope;
   std::stringstream ss;
   ss << "Op(" << type_ << "), inputs:{";
   for (auto it = inputs_.begin(); it != inputs_.end();) {
@@ -470,35 +471,35 @@ class RuntimeInferShapeContext : public InferShapeContext {
       : op_(op), scope_(scope) {}
 
   bool HasInput(const std::string& name) const override {
-    if (!op_.HasInputs(name)) {
+    // has only one input
+    const auto& ins = op_.Inputs();
+    auto it = ins.find(name);
+    if (it == ins.end()) {
       return false;
     }
-    auto& ins = Inputs(name);
-    size_t length = ins.size();
-    if (length == 0) {
+    const auto& in = it->second;
+    if (in.size() == 0 || in[0] == kEmptyVarName) {
       return false;
     }
-    PADDLE_ENFORCE_EQ(length, 1UL,
+    PADDLE_ENFORCE_EQ(in.size(), 1UL,
                       "Input %s should not have more than one inputs", name);
-    auto ipt = ins[0];
-    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-    return var != nullptr;
+    return scope_.FindVar(in[0]) != nullptr;
   }
 
   bool HasOutput(const std::string& name) const override {
-    if (!op_.HasOutputs(name)) {
+    // has only one output
+    const auto& outs = op_.Outputs();
+    auto it = outs.find(name);
+    if (it == outs.end()) {
       return false;
     }
-    auto& outs = Outputs(name);
-    size_t length = outs.size();
-    if (length == 0) {
+    const auto& out = it->second;
+    if (out.size() == 0 || out[0] == kEmptyVarName) {
       return false;
     }
-    PADDLE_ENFORCE_EQ(length, 1UL,
-                      "Output %s should not have more than one inputs", name);
-    auto ipt = outs[0];
-    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-    return var != nullptr;
+    PADDLE_ENFORCE_EQ(out.size(), 1UL,
+                      "Output %s should not have more than one output", name);
+    return scope_.FindVar(out[0]) != nullptr;
   }
 
   bool HasInputs(const std::string& name) const override {
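The rewritten `HasInput`/`HasOutput` replace two lookups (`op_.HasInputs(name)` followed by `Inputs(name)`, the latter throwing when the slot is absent) with a single `find`, and report an empty slot or `kEmptyVarName` as missing rather than erroring. A stand-alone sketch of the pattern, with hypothetical names (the sentinel value is an assumption):

    #include <map>
    #include <string>
    #include <vector>

    using VariableNameMap = std::map<std::string, std::vector<std::string>>;
    constexpr char kEmptyVarNameSketch[] = "@EMPTY@";  // assumed sentinel

    // One map lookup covers all three cases: slot undeclared, declared but
    // empty, or holding exactly one candidate variable name.
    bool HasSingleVar(const VariableNameMap& vars, const std::string& name) {
      auto it = vars.find(name);
      if (it == vars.end()) return false;
      const auto& v = it->second;
      return !v.empty() && v[0] != kEmptyVarNameSketch;
    }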
@@ -545,13 +546,45 @@ class RuntimeInferShapeContext : public InferShapeContext {
     return op_.Outputs(name);
   }
 
-  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
-                size_t j = 0) const override {
+  void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) override {
     PADDLE_ENFORCE_LT(i, Inputs(in).size());
     PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    Variable* in_var = scope_.FindVar(Inputs(in)[i]);
-    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
+    const std::string& input_n = Inputs(in)[i];
+    const std::string& output_n = Outputs(out)[j];
+
+    Variable* in_var = scope_.FindVar(input_n);
+    Variable* out_var = scope_.FindVar(output_n);
+    PADDLE_ENFORCE(in_var->Type() == out_var->Type(),
+                   "The type of %s and %s is not the same.", input_n,
+                   output_n);
+
+    if (in_var->IsType<framework::SelectedRows>()) {
+      auto& in_sele_rows = in_var->Get<framework::SelectedRows>();
+      auto out_sele_rows = out_var->GetMutable<framework::SelectedRows>();
+      out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims());
+      out_sele_rows->set_rows(in_sele_rows.rows());
+      out_sele_rows->set_height(in_sele_rows.height());
+    } else if (in_var->IsType<framework::LoDTensor>()) {
+      auto& in_lod_tensor = in_var->Get<framework::LoDTensor>();
+      auto* out_lod_tensor = out_var->GetMutable<framework::LoDTensor>();
+      out_lod_tensor->Resize(in_lod_tensor.dims());
+    } else {
+      PADDLE_THROW(
+          "Currently, the input type of ShareDim can only be LoDTensor "
+          "or SelectedRows.");
+    }
+  }
+
+  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) const override {
+    const std::vector<std::string>& inputs = Inputs(in);
+    const std::vector<std::string>& outputs = Outputs(out);
+    PADDLE_ENFORCE_LT(i, inputs.size());
+    PADDLE_ENFORCE_LT(j, outputs.size());
+    Variable* in_var = scope_.FindVar(inputs.at(i));
     if (!in_var->IsType<LoDTensor>()) return;
+    Variable* out_var = scope_.FindVar(outputs.at(j));
     PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
                    "The %d-th output of Output(%s) must be LoDTensor.", j, out);
     auto in_tensor = in_var->Get<LoDTensor>();
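The new `ShareDim` forwards shape metadata that `ShareLoD` never touched: dims for a `LoDTensor`, and value dims plus `rows` and `height` for a `SelectedRows`. A typical caller would be an op's `InferShape`; a sketch assuming the conventional "X"/"Out" slot names of a hypothetical element-wise op:

    // The output mirrors the input's shape metadata, then its LoD.
    void InferShape(framework::InferShapeContext* ctx) const override {
      ctx->ShareDim("X", /*out=*/"Out");
      ctx->ShareLoD("X", /*out=*/"Out");
    }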
@@ -579,20 +612,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
     out_tensor->set_layout(in_tensor.layout());
   }
 
-  void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
-                   size_t j = 0) const {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    Variable* in_var = scope_.FindVar(Inputs(in)[i]);
-    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
-    if (!in_var->IsType<LoDTensor>()) return;
-    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
-                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
-    auto in_tensor = in_var->Get<LoDTensor>();
-    auto* out_tensor = out_var->GetMutable<LoDTensor>();
-    out_tensor->set_layout(in_tensor.layout());
-  }
-
   bool IsRuntime() const override { return true; }
 
  protected:
@@ -663,16 +682,12 @@ static void CheckTensorNANOrInf(const std::string& name,
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
-  VLOG(3) << "start Infershape";
   this->InferShape(&infer_shape_ctx);
-  VLOG(3) << "Infershape Pass";
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
   // check if op[type] has kernel registered.
-  VLOG(3) << "Start Kernels";
   auto& all_op_kernels = AllOpKernels();
-  VLOG(3) << "Kernel map finish";
   auto kernels_iter = all_op_kernels.find(type_);
   if (kernels_iter == all_op_kernels.end()) {
     PADDLE_THROW(
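The deleted VLOGs bracketed the kernel-registry lookup. For orientation, the registry behind `AllOpKernels()` maps the op type string to a per-op map keyed by kernel type; roughly, simplified from `operator.h` (exact aliases may differ):

    // Inside OperatorWithKernel (simplified):
    using OpKernelFunc = std::function<void(const ExecutionContext&)>;
    using OpKernelMap =
        std::unordered_map<OpKernelType, OpKernelFunc, OpKernelType::Hash>;
    // op type name -> (kernel key -> kernel functor)
    static std::unordered_map<std::string, OpKernelMap>& AllOpKernels();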
@@ -690,7 +705,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   auto expected_kernel_key =
       this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
-  VLOG(3) << "expected_kernel_key: " << expected_kernel_key;
+  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_MKLDNN
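`expected_kernel_key` is the key into that per-op map, and `find` requires an exact match, so every field of the key has to agree; the `#ifdef PADDLE_WITH_MKLDNN` block truncated here presumably handles the case where no MKLDNN-flavored key matches. An approximate sketch of the key's salient fields (names modeled on `op_kernel_type.h`; treat as illustrative):

    // Two keys compare equal only when all four fields match, which is why a
    // missing MKLDNN kernel needs an explicit fallback to a plain-library key.
    struct OpKernelTypeSketch {
      proto::VarType::Type data_type_;  // e.g. FP32
      DataLayout data_layout_;          // kNCHW, kMKLDNN, kAnyLayout, ...
      platform::Place place_;           // CPUPlace / CUDAPlace
      LibraryType library_type_;        // kPlain, kMKLDNN, kCUDNN
    };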