Merge remote-tracking branch 'upstream/develop' into windows/build

7 years ago · 3a72a634cf
parent 81f750a88c 48be9dc3e1
commit 3a72a634cf
100 changed files with 4394 additions and 1075 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,6 +4,7 @@ paddle/operators/tensor.save
 python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
 python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
 python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
 paddle/fluid/operators/distributed/send_recv.proto
 *.DS_Store
 *.vs
 build/
@ -28,4 +29,5 @@ third_party/
 build_*
 # clion workspace.
 cmake-build-*
 paddle/fluid/operators/distributed/send_recv.proto
 model_test
--- a/paddle/fluid/framework/details/exception_holder.h
+++ b/paddle/fluid/framework/details/exception_holder.h
@ -30,6 +30,8 @@ class ExceptionHolder {
      Catch(exp);
    } catch (platform::EnforceNotMet exp) {
      Catch(exp);
    } catch (std::exception& ex) {
      LOG(FATAL) << "std::exception caught, " << ex.what();
    } catch (...) {
      LOG(FATAL) << "Unknown exception caught";
    }
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@ -418,11 +418,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
      DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
                          &(ctx->cur_ref_cnts_));
    }
    if (FLAGS_benchmark) {
      VLOG(20) << "Memory used after operator " + op->Type() + " running: "
               << memory::memory_usage(place_);
    }
  }
  if (gc != nullptr) {
@ -444,13 +439,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
      scope->DropKids();
    }
  }
  if (FLAGS_benchmark) {
    VLOG(20) << "-------------------------------------------------------";
    VLOG(20) << "Memory used after deleting local scope: "
             << memory::memory_usage(place_);
    VLOG(20) << "-------------------------------------------------------";
  }
 }
 void Executor::RunPreparedContext(
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h
@ -15,24 +15,119 @@
 #pragma once
 #include <string>
 #include <tuple>
 #include <utility>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include <boost/optional.hpp>
 namespace paddle {
 namespace framework {
 namespace ir {
-class ConvElementwiseAddMKLDNNFusePass : public FusePassBase {
+using graph_ptr = std::unique_ptr<ir::Graph>;
 using GraphWithStats = std::pair<ir::Graph*, int>;
 void CorrectGraphEdges(Graph* graph, Node* from, Node* to);
 bool IsReachable(ir::Graph* graph, Node* from, Node* to);
 boost::optional<Node*> HasBias(const Node& op, const std::string& bias_name);
 class ResidualConnectionMKLDNNFusePass : public FusePassBase {
 private:
  GraphWithStats FuseConvAsX(const std::string& name_scope,
                             const GraphWithStats& graph_with_stats) const;
  GraphWithStats FuseConvAsY(const std::string& name_scope,
                             const GraphWithStats& graph_with_stats) const;
  GraphWithStats FuseProjectionConv(
      const std::string& name_scope,
      const GraphWithStats& graph_with_stats) const;
  template <typename RetType>
  using GetNodeFunc =
      std::function<RetType(const GraphPatternDetector::subgraph_t& subgraph)>;
  using IdentityConvFunc = GetNodeFunc<std::tuple<Node*, Node*, Node*, Node*>>;
  using IdentityElementwiseAddFunc =
      GetNodeFunc<std::tuple<Node*, Node*, Node*>>;
  using ProjectionConvFunc = IdentityConvFunc;
  using ProjectionElementwiseAddFunc = GetNodeFunc<std::tuple<Node*, Node*>>;
  using CanFuseFunc = std::function<bool(Node*, Node*)>;
  std::tuple<Node*, Node*, Node*, Node*> GetNodesFromConv(
      const patterns::Conv& conv_pattern,
      const GraphPatternDetector::subgraph_t& subgraph) const;
  std::tuple<Node*, Node*, Node*, Node*> GetNodesFromProjectionConv(
      const patterns::Conv& conv_pattern,
      const GraphPatternDetector::subgraph_t& subgraph) const;
  // Builds a fuse handle of type HandleType and runs the pattern detector
  // `gpd` on the graph with it.
  //
  // gpd:              detector that is invoked as (*gpd)(graph, handle).
  // graph_with_stats: pair of (graph pointer, stats value accumulated so far);
  //                   the int is presumably the number of fusions already
  //                   performed -- TODO confirm against callers.
  // op_funcs:         node-getter callables forwarded to HandleType's
  //                   constructor after the can_fuse predicate.
  //
  // Returns the same graph pointer paired with the incoming stats value plus
  // whatever the handle reports through get_stats().
  template <typename HandleType, typename... OpFuncs>
  GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd,
                                      const GraphWithStats& graph_with_stats,
                                      OpFuncs&&... op_funcs) const {
    ir::Graph* graph;
    int stats;
    // Unpack the pair into its graph pointer and running stats value.
    std::tie(graph, stats) = graph_with_stats;
    // Two ops may be fused only when the pass selects the MKLDNN fuse option
    // for them.
    auto can_fuse = [this](Node* op1, Node* op2) -> bool {
      return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN;
    };
    auto fuse_handle = HandleType{can_fuse, std::forward<OpFuncs>(op_funcs)...};
    // NOTE(review): the detector presumably calls the handle once per matched
    // subgraph; how it stores/copies the handle is not visible here.
    (*gpd)(graph, fuse_handle);
    return std::make_pair(graph, stats + fuse_handle.get_stats());
  }
  // Callable handle used with ExecuteHandleOnGraph. It is constructed from a
  // fusability predicate plus two node-getter callbacks (one yielding four
  // nodes for the conv op, one yielding three nodes for the elementwise_add
  // op) and is invoked with a matched subgraph and the graph.
  // Only declarations are visible here; the constructor and operator() are
  // defined elsewhere.
  struct IdentityFuseHandle {
    IdentityFuseHandle(
        const CanFuseFunc& can_fuse_func,
        const IdentityConvFunc& get_node_from_conv_op,
        const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op);
    void operator()(const GraphPatternDetector::subgraph_t& subgraph,
                    Graph* graph);
    // Returns the current value of the shared counter.
    int get_stats() const { return *fusion_stats; }
   private:
    // Counter held through a shared_ptr, so copies of the handle refer to the
    // same int. NOTE(review): presumably so that increments made through a
    // copy held by the detector stay visible to the caller -- confirm.
    std::shared_ptr<int> fusion_stats;
    CanFuseFunc can_fuse_func;
    IdentityConvFunc get_node_from_conv_op;
    IdentityElementwiseAddFunc get_node_from_elementwise_add_op;
  };
  // Callable handle used with ExecuteHandleOnGraph for the case with two conv
  // ops ("x" and "y"). It is constructed from a fusability predicate, one
  // four-node getter per conv op, and a two-node getter for the
  // elementwise_add op, and is invoked with a matched subgraph and the graph.
  // Only declarations are visible here; the constructor and operator() are
  // defined elsewhere.
  struct ProjectionFuseHandle {
    ProjectionFuseHandle(
        const CanFuseFunc& can_fuse_func,
        const ProjectionConvFunc& get_node_from_conv_x_op,
        const ProjectionConvFunc& get_node_from_conv_y_op,
        const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op);
    void operator()(const GraphPatternDetector::subgraph_t& subgraph,
                    Graph* graph);
    // Returns the current value of the shared counter.
    int get_stats() const { return *fusion_stats; }
   private:
    // Counter held through a shared_ptr, so copies of the handle refer to the
    // same int. NOTE(review): presumably so that increments made through a
    // copy held by the detector stay visible to the caller -- confirm.
    std::shared_ptr<int> fusion_stats;
    CanFuseFunc can_fuse_func;
    ProjectionConvFunc get_node_from_conv_x_op;
    ProjectionConvFunc get_node_from_conv_y_op;
    ProjectionElementwiseAddFunc get_node_from_elementwise_add_op;
  };
 public:
-  virtual ~ConvElementwiseAddMKLDNNFusePass() {}
+  virtual ~ResidualConnectionMKLDNNFusePass() {}
 protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(graph_ptr graph) const;
-  const std::string name_scope_{"residual_connections_fuse_pass"};
+  const std::string name_scope_{"residual_connection_fuse_pass"};
 };
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
@ -40,7 +40,7 @@ void SetOp(ProgramDesc* prog, const std::string& type,
  op->SetOutput(output.first, {output.second});
 }
-struct IsReachable {
+struct TestIsReachable {
  using func = std::function<bool(const std::string&, const std::string&)>;
  auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func {
@ -89,7 +89,9 @@ struct IsReachable {
  }
 };
-void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) {
+void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph,
                    int expected_conv_count,
                    int expected_elementwise_add_count = 0) {
  int conv_count = 0;
  int elementwise_add_count = 0;
@ -101,8 +103,8 @@ void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) {
      ++elementwise_add_count;
    }
  }
-  EXPECT_EQ(conv_count, 1);
+  EXPECT_EQ(conv_count, expected_conv_count);
-  EXPECT_EQ(elementwise_add_count, 0);
+  EXPECT_EQ(elementwise_add_count, expected_elementwise_add_count);
 }
 ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
@ -127,22 +129,13 @@ ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
  return prog;
 }
 }  // namespace
 TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
  auto prog =
      BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
  SetOp(&prog, "conv2d",
        {{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
        {"Output", "b"});
  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+void RunPassAndAssert(ProgramDesc* prog, const std::string& from,
                      const std::string& to, int expected_conv_num) {
  std::unique_ptr<ir::Graph> graph(new ir::Graph(*prog));
-  IsReachable is_reachable;
+  TestIsReachable is_reachable;
-  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+  EXPECT_TRUE(is_reachable(graph)(from, to));
  auto pass =
      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
@ -150,82 +143,87 @@ TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
  graph = pass->Apply(std::move(graph));
  int current_nodes_num = graph->Nodes().size();
-  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+  EXPECT_TRUE(is_reachable(graph)(from, to));
  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
            current_nodes_num);
-  AssertOpsCount(graph);
+  AssertOpsCount(graph, expected_conv_num);
 }
 }  // namespace
-TEST(ConvElementwiseAddMKLDNNFusePass,
+TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddRelu) {
-     ConvolutionWithElementwiseAddReluNoBias) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});
  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
  SetOp(&prog, "conv2d", {{"Input", "a"}, {"Filter", "weights"}},
        {"Output", "b"});
  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  IsReachable is_reachable;
+  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
  SetOp(&prog, "conv2d",
        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
        {"Output", "c"});
-  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+  SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"});
  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
-  auto pass =
+  RunPassAndAssert(&prog, "a", "relu", 1);
-      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
+}
  int original_nodes_num = graph->Nodes().size();
  graph = pass->Apply(std::move(graph));
  int current_nodes_num = graph->Nodes().size();
-  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+TEST(ConvElementwiseAddMKLDNNFusePass,
     ConvolutionAsYWithElementwiseAddReluNoBias) {
  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
-  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
+  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
-            current_nodes_num);
+  SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
        {"Output", "c"});
  SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"});
  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
-  AssertOpsCount(graph);
+  RunPassAndAssert(&prog, "a", "relu", 1);
 }
-TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionElementwiseAdd) {
+TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsXWithElementwiseAddRelu) {
-  auto prog = BuildProgramDesc({"a", "b", "c", "d"}, {"bias", "weights"});
+  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});
  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
  SetOp(&prog, "conv2d",
-        {{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
+        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
-        {"Output", "b"});
+        {"Output", "c"});
  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"});
  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
-  IsReachable is_reachable;
+  RunPassAndAssert(&prog, "a", "relu", 1);
-  EXPECT_TRUE(is_reachable(graph)("a", "d"));
+}
-  auto pass =
+TEST(ConvElementwiseAddMKLDNNFusePass,
-      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
+     ConvolutionAsXWithElementwiseAddReluNoBias) {
-  int original_nodes_num = graph->Nodes().size();
+  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
  graph = pass->Apply(std::move(graph));
  int current_nodes_num = graph->Nodes().size();
-  EXPECT_FALSE(is_reachable(graph)("a", "d"));
+  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
  SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
        {"Output", "c"});
  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"});
  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
-  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
+  RunPassAndAssert(&prog, "a", "relu", 1);
            current_nodes_num);
  AssertOpsCount(graph);
 }
-TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) {
+TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) {
  auto prog =
-      BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
+      BuildProgramDesc({"a", "b", "c", "d", "e", "f", "g"}, {"weights"});
  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
-  SetOp(&prog, "conv2d",
+  SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
        {"Output", "c"});
  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "d"}}, {"Out", "e"});
  SetOp(&prog, "relu", {{"X", "e"}}, {"Out", "f"});
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  SetOp(&prog, "conv2d", {{"Input", "d"}, {"Filter", "weights"}},
        {"Output", "e"});
-  IsReachable is_reachable;
+  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "e"}}, {"Out", "f"});
  SetOp(&prog, "relu", {{"X", "f"}}, {"Out", "g"});
-  EXPECT_TRUE(is_reachable(graph)("a", "f"));
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  TestIsReachable is_reachable;
  EXPECT_TRUE(is_reachable(graph)("a", "g"));
  auto pass =
      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
@ -233,11 +231,10 @@ TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) {
  graph = pass->Apply(std::move(graph));
  int current_nodes_num = graph->Nodes().size();
-  EXPECT_TRUE(is_reachable(graph)("a", "f"));
+  EXPECT_TRUE(is_reachable(graph)("a", "g"));
  EXPECT_EQ(original_nodes_num, current_nodes_num);
-  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
+  AssertOpsCount(graph, 2, 1);
            current_nodes_num);
  AssertOpsCount(graph);
 }
 }  // namespace ir
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@ -1084,16 +1084,12 @@ PDNode *patterns::Conv::operator()() {
  return output_var;
 }
-PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var) {
+PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
  auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
                                ->assert_is_op("elementwise_add");
-  x_var->assert_is_op_input("elementwise_add", "X");
+  x_var->AsInput()->assert_is_op_input("elementwise_add", "X");
-
+  y_var->AsInput()->assert_is_op_input("elementwise_add", "Y");
  auto y_var = pattern->NewNode(elementwise_add_x_repr())
                   ->AsInput()
                   ->assert_is_op_input("elementwise_add", "Y");
  auto out_var = pattern->NewNode(elementwise_add_out_repr())
                     ->AsOutput()
                     ->assert_is_op_output("elementwise_add", "Out");
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@ -664,7 +664,7 @@ struct ElementwiseAdd : public PatternBase {
  ElementwiseAdd(PDPattern* pattern, const std::string& name_scope)
      : PatternBase(pattern, name_scope, "elementwise_add") {}
-  PDNode* operator()(PDNode* x_var);
+  PDNode* operator()(PDNode* x_var, PDNode* y_var);
  PATTERN_DECL_NODE(elementwise_add_op);
  PATTERN_DECL_NODE(elementwise_add_x);
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@ -111,9 +111,6 @@ class LoDTensor : public Tensor {
 public:
  LoDTensor() : Tensor() {}
  /* Constructor with place should only be used in pybind */
  explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
  explicit LoDTensor(const LoD& lod) : lod_(lod) {}
  void set_lod(const LoD& lod) { lod_ = lod; }
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@ -23,6 +23,7 @@
 #include "paddle/fluid/framework/details/cow_ptr.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "glog/logging.h"
@ -31,46 +32,6 @@ namespace paddle {
 namespace framework {
 #if defined(PADDLE_WITH_CUDA)
 namespace details {
 // Non-copyable owner of one raw buffer obtained from memory::Alloc on a
 // CUDAPlace. The buffer is handed back to memory::Free in the destructor.
 struct CUDABuffer {
  // Start of the buffer; nullptr means nothing is held.
  void *data_{nullptr};
  // Size value that was passed to memory::Alloc for data_.
  size_t size_{0};
  // Place the buffer was allocated on.
  platform::CUDAPlace place_;
  // Creates an empty buffer (no allocation).
  CUDABuffer() {}
  // Allocates `size` on `place`. boost::get is applied to `place`, so it must
  // hold a platform::CUDAPlace.
  // NOTE(review): unlike Resize(), the result of memory::Alloc is not checked
  // with PADDLE_ENFORCE_NOT_NULL here.
  CUDABuffer(platform::Place place, size_t size)
      : size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
    data_ = memory::Alloc(place_, size);
  }
  ~CUDABuffer() { ClearMemory(); }
  // Copying is disabled; contents are exchanged through Swap() instead.
  CUDABuffer(const CUDABuffer &o) = delete;
  CUDABuffer &operator=(const CUDABuffer &o) = delete;
  // Frees the current buffer (if any) and allocates a new one of `size` on
  // `place`; the previous contents are not copied over.
  // NOTE(review): ClearMemory() leaves data_ pointing at the freed block, so
  // if boost::get or memory::Alloc can throw here, the destructor would free
  // that block a second time -- confirm whether either can throw.
  void Resize(platform::Place place, size_t size) {
    ClearMemory();
    place_ = boost::get<platform::CUDAPlace>(place);
    data_ = memory::Alloc(place_, size);
    PADDLE_ENFORCE_NOT_NULL(data_);
    size_ = size;
  }
  // Exchanges pointer, place and size with `o`.
  void Swap(CUDABuffer &o) {
    std::swap(data_, o.data_);
    std::swap(place_, o.place_);
    std::swap(size_, o.size_);
  }
 private:
  // Frees data_ if it is non-null. Being const, it does not reset data_ or
  // size_ afterwards.
  void ClearMemory() const {
    if (data_ != nullptr) {
      memory::Free(place_, data_);
    }
  }
 };
 }  // namespace details
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
@ -103,8 +64,6 @@ class Vector {
      o.ImmutableCPU();
      cpu_ = o.cpu_;
      flag_ = kDataInCPU;
      details::CUDABuffer null;
      gpu_.Swap(null);
      return *this;
    }
@ -199,7 +158,7 @@ class Vector {
      PADDLE_ENFORCE(platform::is_gpu_place(place),
                     "CUDA Data must on CUDA place");
      ImmutableCUDA(place);
-      return reinterpret_cast<T *>(gpu_.data_);
+      return reinterpret_cast<T *>(gpu_->ptr());
    }
    // get cuda ptr. mutable
@ -234,13 +193,11 @@ class Vector {
    std::mutex &Mutex() const { return mtx_; }
-    std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
+    boost::optional<platform::CUDAPlace> CUDAPlace() const {
-      if (gpu_.data_ == nullptr) {
+      return gpu_ == nullptr
-        return nullptr;
+                 ? boost::none
-      } else {
+                 : boost::optional<platform::CUDAPlace>(
-        return std::unique_ptr<platform::CUDAPlace>(
+                       boost::get<platform::CUDAPlace>(gpu_->place()));
            new platform::CUDAPlace(gpu_.place_));
      }
    }
   private:
@ -254,13 +211,12 @@ class Vector {
    void CopyToCPU() const {
      // COPY GPU Data To CPU
      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
-          platform::DeviceContextPool::Instance().Get(
+          platform::DeviceContextPool::Instance().Get(gpu_->place()));
              platform::Place(gpu_.place_)));
      auto stream = dev_ctx->stream();
-      void *src = gpu_.data_;
+      void *src = gpu_->ptr();
      void *dst = cpu_.data();
-      memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
+      memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
-                   stream);
+                   gpu_->size(), stream);
      dev_ctx->Wait();
    }
@ -277,8 +233,7 @@ class Vector {
          CopyCPUDataToCUDA(place);
          UnsetFlag(kDirty);
          SetFlag(kDataInCUDA);
-        } else if (IsInCUDA() &&
+        } else if (IsInCUDA() && !(place == gpu_->place())) {
                   !(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
          PADDLE_THROW("This situation should not happen");
          // Still dirty
        } else {
@ -290,7 +245,7 @@ class Vector {
          // Even data is not dirty. However, data is not in CUDA. Copy data.
          CopyCPUDataToCUDA(place);
          SetFlag(kDataInCUDA);
-        } else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
+        } else if (!(place == gpu_->place())) {
          PADDLE_THROW("This situation should not happen.");
        } else {
          // Not Dirty && DataInCUDA && Device is same
@ -301,13 +256,13 @@ class Vector {
    void CopyCPUDataToCUDA(const platform::Place &place) const {
      void *src = cpu_.data();
-      gpu_.Resize(place, cpu_.size() * sizeof(T));
+      gpu_ = memory::Alloc(place, cpu_.size() * sizeof(T));
-      void *dst = gpu_.data_;
+      void *dst = gpu_->ptr();
      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
          platform::DeviceContextPool::Instance().Get(place));
      auto stream = dev_ctx->stream();
-      memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
+      memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
-                   stream);
+                   gpu_->size(), stream);
    }
    void ImmutableCPU() const {
@ -329,7 +284,7 @@ class Vector {
    bool IsInCPU() const { return flag_ & kDataInCPU; }
    mutable std::vector<T> cpu_;
-    mutable details::CUDABuffer gpu_;
+    mutable memory::AllocationPtr gpu_;
    mutable int flag_;
    mutable std::mutex mtx_;
@ -428,8 +383,8 @@ class Vector {
      auto &mtx = m_.Data().Mutex();
      std::lock_guard<std::mutex> guard(mtx);
      auto cuda_place = m_.Data().CUDAPlace();
-      if (cuda_place == nullptr ||
+      if (cuda_place == boost::none ||
-          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+          cuda_place == boost::get<platform::CUDAPlace>(place)) {
        return m_.Data().CUDAData(place);
      }
    }
@ -444,8 +399,8 @@ class Vector {
      auto &mtx = m_.Data().Mutex();
      std::lock_guard<std::mutex> guard(mtx);
      auto cuda_place = m_.Data().CUDAPlace();
-      if (cuda_place == nullptr ||
+      if (cuda_place == boost::none ||
-          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+          cuda_place == boost::get<platform::CUDAPlace>(place)) {
        return m_.MutableData()->CUDAMutableData(place);
      }
    }
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@ -32,10 +32,9 @@ size_t Tensor::memory_size() const {
 }
 void* Tensor::mutable_data(platform::Place place, std::type_index type,
                           memory::Allocator::Attr attr,
                           size_t requested_size) {
-  if (holder_ != nullptr) {
+  type_ = type;
    holder_->set_type(type);
  }
  PADDLE_ENFORCE_GE(numel(), 0,
                    "When calling this method, the Tensor's numel must be "
                    "equal or larger than zero. "
@ -48,35 +47,18 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
  /* some versions of boost::variant don't have operator!= */
  if (holder_ == nullptr || !(holder_->place() == place) ||
      holder_->size() < size + offset_) {
-    if (platform::is_cpu_place(place)) {
+    holder_ = memory::AllocShared(place, size, attr);
      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
          boost::get<platform::CPUPlace>(place), size, type));
    } else if (platform::is_gpu_place(place) ||
               platform::is_cuda_pinned_place(place)) {
 #ifndef PADDLE_WITH_CUDA
      PADDLE_THROW(
          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
    }
 #else
      if (platform::is_gpu_place(place)) {
        holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
            boost::get<platform::CUDAPlace>(place), size, type));
      } else if (platform::is_cuda_pinned_place(place)) {
        holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
            boost::get<platform::CUDAPinnedPlace>(place), size, type));
      }
    }
 #endif
    offset_ = 0;
  }
  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                 offset_);
 }
-void* Tensor::mutable_data(platform::Place place, size_t requested_size) {
+void* Tensor::mutable_data(platform::Place place, memory::Allocator::Attr attr,
                           size_t requested_size) {
  PADDLE_ENFORCE(this->holder_ != nullptr,
                 "Cannot invoke mutable data if current hold nothing.");
-  return mutable_data(place, holder_->type(), requested_size);
+  return mutable_data(place, type_, attr, requested_size);
 }
 Tensor& Tensor::ShareDataWith(const Tensor& src) {
@ -101,6 +83,7 @@ Tensor Tensor::Slice(int begin_idx, int end_idx) const {
    Tensor dst;
    dst.holder_ = holder_;
    dst.set_layout(layout_);
    dst.type_ = type_;
    DDim dst_dims = dims_;
    dst_dims[0] = end_idx - begin_idx;
    dst.Resize(dst_dims);
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@ -67,12 +67,7 @@ class Tensor {
  friend struct EigenVector;
 public:
-  Tensor() : offset_(0) {}
+  Tensor() : type_(typeid(float)), offset_(0) {}
  /*! Constructor with place should only be used in pybind. */
  explicit Tensor(const platform::Place& place) : offset_(0) {
    holder_->set_place(place);
  }
  /*! Return a pointer to mutable memory block. */
  template <typename T>
@ -89,12 +84,17 @@ class Tensor {
   * @note    If not exist, then allocation.
   */
  template <typename T>
-  T* mutable_data(platform::Place place, size_t requested_size = 0);
+  T* mutable_data(platform::Place place,
                  memory::Allocator::Attr attr = memory::Allocator::kDefault,
                  size_t requested_size = 0);
  void* mutable_data(platform::Place place, std::type_index type,
                     memory::Allocator::Attr attr = memory::Allocator::kDefault,
                     size_t requested_size = 0);
-  void* mutable_data(platform::Place place, size_t requested_size = 0);
+  void* mutable_data(platform::Place place,
                     memory::Allocator::Attr attr = memory::Allocator::kDefault,
                     size_t requested_size = 0);
  /**
   * @brief     Return a pointer to mutable memory block.
@ -106,7 +106,9 @@ class Tensor {
   * @note      If not exist, then allocation.
   */
  template <typename T>
-  T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0);
+  T* mutable_data(DDim dims, platform::Place place,
                  memory::Allocator::Attr attr = memory::Allocator::kDefault,
                  size_t requested_size = 0);
  /*! Return the dimensions of the memory block. */
  const DDim& dims() const;
@ -139,7 +141,7 @@ class Tensor {
  std::type_index type() const {
    PADDLE_ENFORCE_NOT_NULL(
        holder_, "Tensor not initialized yet when Tensor::type() is called.");
-    return holder_->type();
+    return type_;
  }
  // memory size returns the holding memory size in byte.
@ -153,56 +155,13 @@ class Tensor {
  void clear() { holder_ = nullptr; }
- private:
+  const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; }
-  /**
+  size_t offset() const { return offset_; }
   * @note    Placeholder hides type T, so it doesn't appear as a template
   *          parameter of Variable.
   */
  // Abstract interface over an allocated memory block. Implementations expose
  // the block's address, size, element type and place, and allow the recorded
  // type and place to be overwritten.
  struct Placeholder {
    virtual ~Placeholder() = default;
    // Address of the memory block.
    virtual void* ptr() const = 0;
    // Size of the memory block.
    virtual size_t size() const = 0;
    // Type currently recorded for the block's contents.
    virtual std::type_index type() const = 0;
    // Place currently recorded for the block.
    virtual platform::Place place() const = 0;
    // Overwrite the recorded type / place.
    virtual void set_type(std::type_index type) = 0;
    virtual void set_place(platform::Place place) = 0;
  };
  // Placeholder implementation that allocates its block with
  // memory::Alloc(place, size) at construction and owns it through a
  // unique_ptr whose deleter is memory::PODDeleter<uint8_t, Place>
  // constructed from the same place.
  template <typename Place>
  struct PlaceholderImpl : public Placeholder {
    // Allocates `size` on `place` and records `type`. Enforces that the
    // allocation returned a non-null pointer.
    // NOTE(review): the failure message prints "GPU" for every place for
    // which is_cpu_place() is false -- presumably that includes CUDA pinned
    // memory; confirm.
    PlaceholderImpl(Place place, size_t size, std::type_index type)
        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
               memory::PODDeleter<uint8_t, Place>(place)),
          place_(place),
          size_(size),
          type_(type) {
      PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
                              (is_cpu_place(place_) ? "CPU" : "GPU"));
    }
    virtual size_t size() const { return size_; }
    virtual platform::Place place() const { return place_; }
    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
    virtual std::type_index type() const { return type_; }
    // The setters only overwrite the recorded metadata; ptr_ is not touched.
    virtual void set_type(std::type_index type) { type_ = type; }
    virtual void set_place(platform::Place place) { place_ = place; }
    /*! the pointer of memory block. */
    std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
    /*! the place of memory block. */
    platform::Place place_;
    /*! the size of memory block. */
    size_t size_;
    /* the current type of memory */
    std::type_index type_;
  };
 private:
  /*! holds the memory block if allocated. */
-  std::shared_ptr<Placeholder> holder_;
+  std::shared_ptr<memory::Allocation> holder_;
-
+  std::type_index type_;
  /**
   * @brief points to elements dimensions.
   *
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@ -23,10 +23,10 @@ namespace framework {
 template <typename T>
 inline const T* Tensor::data() const {
  check_memory_size();
-  bool valid = std::is_same<T, void>::value ||
+  bool valid =
-               holder_->type() == std::type_index(typeid(T));
+      std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
-                 this->holder_->type().name());
+                 type_.name());
  return reinterpret_cast<const T*>(
      reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
@ -37,26 +37,30 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; }
 template <typename T>
 inline T* Tensor::data() {
  check_memory_size();
-  bool valid = std::is_same<T, void>::value ||
+  bool valid =
-               holder_->type() == std::type_index(typeid(T));
+      std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
-                 this->holder_->type().name());
+                 type_.name());
  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                              offset_);
 }
 template <typename T>
 inline T* Tensor::mutable_data(DDim dims, platform::Place place,
                               memory::Allocator::Attr attr,
                               size_t requested_size) {
  static_assert(std::is_pod<T>::value, "T must be POD");
  Resize(dims);
-  return mutable_data<T>(place, requested_size);
+  return mutable_data<T>(place, attr, requested_size);
 }
 template <typename T>
-inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) {
+inline T* Tensor::mutable_data(platform::Place place,
                               memory::Allocator::Attr attr,
                               size_t requested_size) {
  static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T), requested_size));
+  return reinterpret_cast<T*>(
      mutable_data(place, typeid(T), attr, requested_size));
 }
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
@ -379,7 +379,9 @@ TEST(Tensor, FromAndToStream) {
    TensorToStream(oss, gpu_tensor, gpu_ctx);
    std::istringstream iss(oss.str());
-    TensorFromStream(iss, &dst_tensor, gpu_ctx);
+    TensorFromStream(
        iss, &dst_tensor,
        *platform::DeviceContextPool::Instance().Get(platform::CPUPlace()));
    int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
    for (int i = 0; i < 6; ++i) {
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
@ -1,15 +1,12 @@
 add_subdirectory(detail)
-
+add_subdirectory(allocation)
-cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce)
+cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade)
 cc_library(memcpy SRCS memcpy.cc DEPS place)
 cc_library(memory
        DEPS
        malloc
        memcpy)
 cc_test(malloc_test SRCS malloc_test.cc DEPS malloc)
 #if (WITH_GPU)
 #   nv_test(pinned_memory_test SRCS pinned_memory_test.cu  DEPS place memory)
 #endif()
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@ -0,0 +1,64 @@
 # Build rules for the allocators under paddle/fluid/memory/allocation.
 #
 # `allocator` is the base library; every concrete allocator below lists it in
 # its DEPS.
 cc_library(allocator SRCS allocator.cc DEPS place)
 cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
 # legacy_allocator additionally depends on buddy_allocator.
 cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator)
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
 # cuda_allocator is only declared when building with GPU support.
 if (WITH_GPU)
  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
 endif()
 cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
 # best_fit_allocator_test: the GPU build adds the .cu source and the CUDA
 # related dependencies; the CPU build uses the .cc source only.
 if (WITH_GPU)
    nv_test(best_fit_allocator_test
            SRCS best_fit_allocator_test.cc
                 best_fit_allocator_test.cu
            DEPS best_fit_allocator
                 locked_allocator
                 cpu_allocator
                 cuda_allocator
                 device_context
                 memcpy)
 else()
    cc_test(best_fit_allocator_test
            SRCS best_fit_allocator_test.cc
            DEPS best_fit_allocator
                 locked_allocator
                 cpu_allocator)
 endif()
 # NOTE(review): pinned_allocator is declared with nv_library outside of
 # `if (WITH_GPU)`, unlike cuda_allocator above. Presumably nv_library is a
 # no-op in CPU-only builds -- confirm against the nv_library definition.
 nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
 # Extra dependencies of allocator_facade that exist only in GPU builds; the
 # list is empty otherwise.
 if (WITH_GPU)
    set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard)
 else ()
    set(AllocatorFacadeDeps)
 endif()
 cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
 cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
 cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator)
 cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags)
 # allocator_facade links every allocator declared above plus the GPU-only
 # extras collected in AllocatorFacadeDeps.
 cc_library(allocator_facade SRCS allocator_facade.cc DEPS
        ${AllocatorFacadeDeps}
        cpu_allocator
        locked_allocator
        best_fit_allocator
        aligned_allocator
        auto_increment_allocator
        zero_size_allocator
        conditional_allocator
        retry_allocator
        buffered_allocator
        allocator_strategy
        legacy_allocator
        )
 # NOTE(review): this nv_test is also declared outside of `if (WITH_GPU)`;
 # same assumption as for pinned_allocator above -- confirm.
 nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
 cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
 cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
--- a/paddle/fluid/memory/allocation/aligned_allocator.cc
+++ b/paddle/fluid/memory/allocation/aligned_allocator.cc
@ -0,0 +1,31 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/memory/allocation/aligned_allocator.h"
 namespace paddle {
 namespace memory {
 namespace allocation {
 ThinAlignedAllocator::ThinAlignedAllocator(
    std::shared_ptr<Allocator> underlyning_allocator)
    : underlying_allocator_(std::move(underlyning_allocator)) {}
 // Delegates the thread-safety query to the wrapped allocator.
 bool ThinAlignedAllocator::IsAllocThreadSafe() const {
   const bool underlying_is_thread_safe =
       underlying_allocator_->IsAllocThreadSafe();
   return underlying_is_thread_safe;
 }
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
@ -0,0 +1,100 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <memory>
 #include "paddle/fluid/memory/allocation/allocator.h"
 namespace paddle {
 namespace memory {
 namespace allocation {
 // AlignedAllocation wraps an allocation obtained from another allocator and
 // exposes a pointer inside it that is aligned to kAlignment bytes. The wrapped
 // allocation is owned by this object and released together with it.
 //
 // NOTE(yy): For speed reason, the alignment is a template parameter; however,
 // it can be a private member if necessary.
 //
 // NOTE(yy): kAlignment must be 2^N. This is enforced by the `static_assert`
 // inside the class.
 template <size_t kAlignment>
 class AlignedAllocation : public Allocation {
   static_assert(kAlignment > 0 && (kAlignment & (kAlignment - 1)) == 0,
                 "kAlignment must be 2^N");
  public:
   // Takes ownership of `underlying_allocation`. `size` is the number of bytes
   // the user asked for. The reported size is
   // `size + kAlignment - Offset(raw pointer)`, which assumes the underlying
   // allocation was requested with `size + kAlignment` bytes.
   AlignedAllocation(AllocationPtr&& underlying_allocation, size_t size)
       : Allocation(AlignedPtr(underlying_allocation->ptr()),
                    size + kAlignment - Offset(underlying_allocation->ptr()),
                    underlying_allocation->place()),
         underlying_allocation_(std::move(underlying_allocation)) {}
  private:
   // Returns `ptr` advanced to the next kAlignment boundary (or `ptr` itself
   // when it is already aligned).
   static void* AlignedPtr(void* ptr) {
     return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(ptr) +
                                    Offset(ptr));
   }
   // Offset to aligned pointer.
   // If ptr is already aligned, returns 0; otherwise returns a value in
   // [1, kAlignment - 1].
   //
   // The computation is done entirely in unsigned arithmetic. Because
   // kAlignment is a power of two, `addr & (kAlignment - 1)` is the distance of
   // `addr` past the previous aligned address.
   static size_t Offset(void* ptr) {
     const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
     const uintptr_t misalignment = addr & (kAlignment - 1);
     if (misalignment == 0) {
       return 0;
     }
     return static_cast<size_t>(kAlignment - misalignment);
   }
   // Keeps the raw (possibly unaligned) allocation alive.
   AllocationPtr underlying_allocation_;
 };
 // Thin aligned allocator is trivial and used to generate a small size binary.
 //
 // NOTE(yy): This is a trick to make a template class. This class extract the
 // common code into a `thin` class. So if there are multiple specification of
 // the template class, the binary size will not extended too much.
 //
 // NOTE(yy): This could be an over design. If it harms readability of code, it
 // could be removed later.
 class ThinAlignedAllocator : public Allocator {
 public:
  explicit ThinAlignedAllocator(
      std::shared_ptr<Allocator> underlyning_allocator);
  bool IsAllocThreadSafe() const;
 protected:
  std::shared_ptr<Allocator> underlying_allocator_;
 };
 // AlignedAllocator asks the underlying allocator for `size + kAlignment` bytes
 // and wraps the result in an AlignedAllocation<kAlignment>, which adjusts the
 // pointer offset.
 template <size_t kAlignment>
 class AlignedAllocator : public ThinAlignedAllocator {
  public:
   // Reuse the base-class constructor.
   using ThinAlignedAllocator::ThinAlignedAllocator;
  protected:
   // Returns a heap-allocated AlignedAllocation; ownership passes to the
   // caller.
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override {
     const size_t padded_size = size + kAlignment;
     AllocationPtr unaligned =
         underlying_allocator_->Allocate(padded_size, attr);
     return new AlignedAllocation<kAlignment>(std::move(unaligned), size);
   }
 };
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
+++ b/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
@ -0,0 +1,48 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 // NOTE(yy): This unit test is not important; it is only used for debugging
 // and can be removed later.
 // Device functor: writes 0.0f into element `i` of the buffer at `ptr_`.
 // Members of a struct are public by default, so no access specifier is
 // needed.
 struct FillZero {
   float* ptr_;
   __device__ void operator()(size_t i) {
     ptr_[i] = 0.0f;
   }
 };
 namespace paddle {
 // Allocates a 10x10 float tensor on GPU 0 and zeroes it three times: with
 // cudaMemset, with the FillZero functor through ForRange, and with an Eigen
 // expression. The test has no assertions; it only checks that the calls run.
 TEST(Eigen, main) {
   platform::CUDAPlace gpu(0);
   framework::Tensor tensor;
   float* ptr = tensor.mutable_data<float>({10, 10}, gpu);
   // 10 x 10 elements, matching the dims requested above.
   constexpr size_t kNumel = 100;
   auto* pooled_ctx = platform::DeviceContextPool::Instance().Get(gpu);
   auto& dev_ctx = *reinterpret_cast<platform::CUDADeviceContext*>(pooled_ctx);
   // 1) Plain CUDA memset.
   PADDLE_ENFORCE(cudaMemset(ptr, 0, sizeof(float) * kNumel));
   // 2) Per-element functor applied through ForRange.
   platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, kNumel);
   for_range(FillZero{ptr});
   dev_ctx.Wait();
   // 3) Eigen expression evaluated on dev_ctx's Eigen device.
   auto flat = framework::EigenVector<float>::Flatten(tensor);
   auto& eigen_dev = *dev_ctx.eigen_device();
   flat.device(eigen_dev) = flat.constant(0.0f);
 }
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocation_with_underlying.h
+++ b/paddle/fluid/memory/allocation/allocation_with_underlying.h
@ -0,0 +1,33 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include "paddle/fluid/memory/allocation/allocator.h"
 namespace paddle {
 namespace memory {
 namespace allocation {
 class AllocationWithUnderlying : public Allocation {
 public:
  explicit AllocationWithUnderlying(AllocationPtr allocation)
      : Allocation(allocation->ptr(), allocation->size(), allocation->place()),
        allocation_(std::move(allocation)) {}
  AllocationPtr allocation_;
 };
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocator.cc
+++ b/paddle/fluid/memory/allocation/allocator.cc
@ -0,0 +1,45 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include <functional>
 namespace paddle {
 namespace memory {
 namespace allocation {
 // Out-of-line definitions of the virtual destructors declared in allocator.h.
 Allocation::~Allocation() = default;
 Allocator::~Allocator() = default;
 // Base-class answer for allocators that do not override this query.
 bool Allocator::IsAllocThreadSafe() const {
   return false;
 }
 // Calls the subclass-provided AllocateImpl(), records this allocator on the
 // resulting allocation (AllocationDeleter reads it back to call Free()), and
 // hands ownership to the caller as an AllocationPtr.
 AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
   Allocation* allocation = AllocateImpl(size, attr);
   allocation->set_allocator(this);
   return AllocationPtr(allocation);
 }
 // Base-class release: destroys the allocation object with `delete`.
 void Allocator::Free(Allocation* allocation) {
   delete allocation;
 }
 // Returns the stored message as a C string; the pointer refers to `msg_`.
 const char* BadAlloc::what() const noexcept {
   return msg_.c_str();
 }
 // Returns `allocation` to the allocator recorded on it via set_allocator().
 void AllocationDeleter::operator()(Allocation* allocation) const {
   allocation->allocator()->Free(allocation);
 }
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@ -0,0 +1,145 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <memory>
 #include <string>
 #include "paddle/fluid/platform/place.h"
 namespace paddle {
 namespace memory {
 namespace allocation {
 // Exception type for a failed `Alloc`/`AllocShared`. It carries a
 // human-readable message that what() exposes.
 class BadAlloc : public std::exception {
  public:
   // `msg` is taken by value and moved into the object.
   explicit BadAlloc(std::string msg) : msg_{std::move(msg)} {}
   // Defined out of line.
   const char* what() const noexcept override;
  private:
   std::string msg_;
 };
 class Allocation;
 // Deleter type used by AllocationPtr (a std::unique_ptr<Allocation, ...>).
 // Its call operator is defined in allocator.cc and releases the allocation
 // through the Allocator stored on it.
 class AllocationDeleter {
  public:
   // Releases `allocation` by calling Free() on `allocation->allocator()`.
   void operator()(Allocation* allocation) const;
 };
 class Allocator;
 // Allocation is the object holding the actual pointer. `Allocation::ptr()`
 // returns the pointer that was allocated.
 //
 // NOTE: this is the base class of Allocation. Each allocator can use its own
 //       allocation object.
 // NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0
 class Allocation {
  public:
   // Records the pointer, size and place. The owning allocator starts out as
   // nullptr and is filled in later through set_allocator().
   Allocation(void* ptr, size_t size, platform::Place place)
       : allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {}
   // Allocations are not copyable.
   Allocation(const Allocation& o) = delete;
   Allocation& operator=(const Allocation& o) = delete;
   // Returns the holding pointer.
   // NOTE: For performance consideration, it is better not to make this method
   // as a virtual method. If we want to implement a `defragmentation` later,
   // we might need to make `ptr_` field as a protected field, and add a virtual
   // method like `defragmentation` to change `ptr_`.
   void* ptr() const { return ptr_; }
   // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
   // last valid byte.
   //
   // NOTE: Some allocators might allocate more memory than requested, so the
   // size could be larger than the request. For example,
   //    the AlignedAllocator will always allocate memory as size + kAlignment.
   //    The raw pointer might not be aligned, so an offset might be added to
   //    the raw pointer. The size of this allocation will be
   //    `size + kAlignment - offset`.
   size_t size() const { return size_; }
   const platform::Place& place() const { return place_; }
   // Returns the allocator recorded by set_allocator(), or nullptr if none has
   // been set. Const-qualified so it can be called on a const Allocation; the
   // returned pointer itself is still non-const.
   Allocator* allocator() const { return allocator_; }
   void set_allocator(Allocator* allocator) { allocator_ = allocator; }
   virtual ~Allocation();
  private:
   Allocator* allocator_;
   void* ptr_;
   size_t size_;
   platform::Place place_;
 };
 using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>;
 // Base interface class of memory Allocator.
 // To allocate memory, an allocator needs two parameters:
 //    1. size in bytes.
 //    2. attribute of the memory.
 // NOTE: the attribute of memory might be ignored if the allocator does not
 // care about it.
 class Allocator {
   // AllocationDeleter calls the protected Free().
   friend class AllocationDeleter;
  public:
   enum Attr {
     // Default attribute. Uses the fast or stablest allocation algorithm.
     kDefault = 0,
     // The allocation may not be freed until the program ends. e.g.,
     // `Parameters` and `Momentum`.
     kFixedHuge = 1,
     // The allocation may be created and freed frequently and the allocation
     // is considerably huge. Like `activations` and gradients.
     kFluxHuge = 2,
     // The `Scratchpad` memory is allocated and freed very soon, usually
     // within an operator or aux memory. Like CUDNN workspace, AUX memory in
     // batch norm, etc.
     //
     // https://en.wikipedia.org/wiki/Scratchpad_memory
     kScratchpad = 3,
     // The memory used for cross-device memory copy/communication.
     // For example:
     // 1. it can use a `pinned` memory for CPU-GPU communication.
     // 2. it can use a `registered` memory for RDMA communication.
     kCrossDevice = 4,
     // The number of all attributes. It is used internally.
     NumOfAttrs = 5
   };
   virtual ~Allocator();
   // Allocate an allocation.
   AllocationPtr Allocate(size_t size, Allocator::Attr attr = kDefault);
   // True if the `Allocate` is thread safe.
   virtual bool IsAllocThreadSafe() const;
  protected:
   // Releases an allocation previously produced by this allocator.
   virtual void Free(Allocation* allocation);
   // Implemented by subclasses; returns the allocation that Allocate() then
   // wraps in an AllocationPtr.
   virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0;
 };
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@ -0,0 +1,57 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <memory>
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/place.h"
 namespace paddle {
 namespace memory {
 namespace allocation {
 // AllocatorFacade is the interface exposed to other modules. All the
 // configuration or dirty code under development should be hidden behind this
 // facade.
 //
 // NOTE(yy): This class is a singleton class.
 // NOTE(yy): To create a stable ABI and make compilation faster, the
 // implementation lives behind a Pimpl (`AllocatorFacadePrivate`).
 class AllocatorFacadePrivate;
 class AllocatorFacade {
  public:
   // Access point of the singleton.
   static AllocatorFacade& Instance();
   // Not copyable.
   AllocatorFacade(const AllocatorFacade&) = delete;
   const AllocatorFacade& operator=(const AllocatorFacade&) = delete;
   ~AllocatorFacade();
   // Allocate a shared allocation.
   std::shared_ptr<Allocation> AllocShared(
       const platform::Place& place, size_t size,
       Allocator::Attr attr = Allocator::kDefault);
   // Allocate a unique allocation.
   AllocationPtr Alloc(const platform::Place& place, size_t size,
                       Allocator::Attr attr = Allocator::kDefault);
   // TODO(yy): Allocate a Copy-On-Write allocation?
  private:
   // Private: instances are obtained through Instance() only.
   AllocatorFacade();
   // Pointer to the implementation object.
   AllocatorFacadePrivate* m_;
 };
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocator_facade_test.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade_test.cc
@ -0,0 +1,87 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
 DECLARE_int64(gpu_allocator_retry_time);
 #endif
 namespace paddle {
 namespace memory {
 namespace allocation {
 // Exercises AllocatorFacade::Alloc for each place. Every case checks that the
 // allocation and its pointer are non-null and that it reports the requested
 // place. The CPU case requires the exact size; the CUDA cases accept a size
 // that is at least the request.
 TEST(allocator, allocator) {
 #ifdef PADDLE_WITH_CUDA
   FLAGS_fraction_of_gpu_memory_to_use = 0.01;
   FLAGS_gpu_allocator_retry_time = 500;
   FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
 #endif
   auto &instance = AllocatorFacade::Instance();
   {
     // Host memory.
     const platform::Place place = platform::CPUPlace();
     const size_t size = 1024;
     auto cpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(cpu_allocation, nullptr);
     ASSERT_NE(cpu_allocation->ptr(), nullptr);
     ASSERT_EQ(cpu_allocation->place(), place);
     ASSERT_EQ(cpu_allocation->size(), size);
   }
 #ifdef PADDLE_WITH_CUDA
   {
     // Small allocation on GPU 0.
     const platform::Place place = platform::CUDAPlace(0);
     const size_t size = 1024;
     auto gpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(gpu_allocation, nullptr);
     ASSERT_NE(gpu_allocation->ptr(), nullptr);
     ASSERT_EQ(gpu_allocation->place(), place);
     ASSERT_GE(gpu_allocation->size(), size);
   }
   {
     // Allocate 2GB gpu memory
     const platform::Place place = platform::CUDAPlace(0);
     const size_t size = 2 * static_cast<size_t>(1 << 30);
     auto gpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(gpu_allocation, nullptr);
     ASSERT_NE(gpu_allocation->ptr(), nullptr);
     ASSERT_EQ(gpu_allocation->place(), place);
     ASSERT_GE(gpu_allocation->size(), size);
   }
   {
     // 1MB of CUDA pinned host memory.
     const platform::Place place = platform::CUDAPinnedPlace();
     const size_t size = (1 << 20);
     auto cuda_pinned_allocation = instance.Alloc(place, size);
     ASSERT_NE(cuda_pinned_allocation, nullptr);
     ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
     ASSERT_EQ(cuda_pinned_allocation->place(), place);
     ASSERT_GE(cuda_pinned_allocation->size(), size);
   }
 #endif
 }
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/Show More
+++ b/Show More