Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into imdb_data

8 years ago · 17f4438413
parent 26c9e8e70b ea4bdca86c
commit 17f4438413
198 changed files with 3032 additions and 1420 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -16,8 +16,6 @@ cmake_minimum_required(VERSION 3.0)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
-SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
-SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")

 include(system)

@ -201,6 +199,10 @@ if(WITH_GOLANG)
 endif(WITH_GOLANG)

 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
+
+SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+
 add_subdirectory(paddle)
 if(WITH_PYTHON)
  add_subdirectory(python)
--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
@ -6,8 +6,18 @@ height = 227
 width = 227
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
+gp = get_config_arg('layer_num', int, 1)
+is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer,
+    'num_samples': num_samples
+}
 define_py_data_sources2(
    "train.list", None, module="provider", obj="process", args=args)

@ -31,7 +41,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2)

 # conv2
 net = img_conv_layer(
-    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
 net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
 net = img_pool_layer(input=net, pool_size=3, stride=2)

@ -40,11 +50,11 @@ net = img_conv_layer(
    input=net, filter_size=3, num_filters=384, stride=1, padding=1)
 # conv4
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)

 # conv5
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
 net = img_pool_layer(input=net, pool_size=3, stride=2)

 net = fc_layer(
@ -59,6 +69,9 @@ net = fc_layer(
    layer_attr=ExtraAttr(drop_rate=0.5))
 net = fc_layer(input=net, size=1000, act=SoftmaxActivation())

-lab = data_layer('label', num_class)
-loss = cross_entropy(input=net, label=lab)
-outputs(loss)
+if is_infer:
+    outputs(net)
+else:
+    lab = data_layer('label', num_class)
+    loss = cross_entropy(input=net, label=lab)
+    outputs(loss)
--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
 use_gpu = get_config_arg('use_gpu', bool, True)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
    "train.list" if not is_infer else None,
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@ -14,6 +14,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
    else:
        settings.data_size = settings.height * settings.width
    settings.is_infer = kwargs.get('is_infer', False)
+    settings.num_samples = kwargs.get('num_samples', 2560)
    if settings.is_infer:
        settings.slots = [dense_vector(settings.data_size)]
    else:
@ -23,7 +24,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
@provider(
    init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_list):
-    for i in xrange(2560 if settings.is_infer else 1024):
+    for i in xrange(settings.num_samples):
        img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
        if settings.is_infer:
            yield img.astype('float32')
--- a/benchmark/paddle/image/resnet.py
+++ b/benchmark/paddle/image/resnet.py
@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg("layer_num", int, 50)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
    "train.list" if not is_infer else None,
--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
@ -37,7 +37,7 @@ function infer() {
      --trainer_count=1 \
      --num_passes=1 \
      --save_dir="models/${topology}-${layer_num}" \
-      --config_args="batch_size=128,layer_num=${layer_num}" \
+      --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
      > /dev/null 2>&1
    echo "Done"
  fi
@ -79,8 +79,9 @@ fi
 # inference benchmark
 for use_mkldnn in True False; do
  for batchsize in 1 2 4 8 16; do
-    infer googlenet v1 $batchsize $use_mkldnn
-    infer resnet 50 $batchsize $use_mkldnn
    infer vgg 19 $batchsize $use_mkldnn
+    infer resnet 50 $batchsize $use_mkldnn
+    infer googlenet v1 $batchsize $use_mkldnn
+    infer alexnet 2 $batchsize $use_mkldnn
  done
 done
--- a/benchmark/paddle/image/run_mkl_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
@ -47,5 +47,6 @@ for use_mkldnn in True False; do
    train vgg 19 $batchsize $use_mkldnn
    train resnet 50 $batchsize $use_mkldnn
    train googlenet v1 $batchsize $use_mkldnn
+    train alexnet 2 $batchsize $use_mkldnn
  done
 done
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@ -23,24 +23,25 @@ function infer() {
    echo "./run_mkl_infer.sh to save the model first"
    exit 0
  fi
-  log_period=$((256 / bs))
+  log_period=$((32 / bs))
  paddle train --job=test \
    --config="${topology}.py" \
+    --use_mkldnn=False \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=$log_period \
-    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
    --init_model_path=$models_in \
    2>&1 | tee ${log}

-  # calculate the last 5 logs period time of 1280 samples,
+  # calculate the last 5 logs period time of 160(=32*5) samples,
  # the time before are burning time.
  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  start_sec=`clock_to_seconds $start`
  end_sec=`clock_to_seconds $end`
-  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
-  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
+  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
 }

@ -56,7 +57,8 @@ fi

 # inference benchmark
 for batchsize in 1 2 4 8 16; do
-  infer googlenet v1 $batchsize
-  infer resnet 50 $batchsize
  infer vgg 19 $batchsize
+  infer resnet 50 $batchsize 
+  infer googlenet v1 $batchsize
+  infer alexnet 2 $batchsize
 done
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
@ -12,10 +12,11 @@ function train() {
  config="${topology}.py"
  paddle train --job=time \
    --config=$config \
+    --use_mkldnn=False \
    --use_gpu=False \
    --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
+    --log_period=3 \
+    --test_period=30 \
    --config_args=$args \
    2>&1 | tee ${log} 

@ -36,4 +37,5 @@ for batchsize in 64 128 256; do
  train vgg 19 $batchsize
  train resnet 50 $batchsize
  train googlenet v1 $batchsize
+  train alexnet 2 $batchsize
 done
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg('layer_num', int, 19)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
    "train.list" if not is_infer else None,
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@ -170,6 +170,18 @@ sequence_pool
    :noindex:


+sequence_first_step
+-------------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_first_step
+    :noindex:
+
+
+sequence_last_step
+------------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_last_step
+    :noindex:
+
+
 pool2d
 ------
 ..  autofunction:: paddle.v2.fluid.layers.pool2d
@ -318,3 +330,9 @@ reduce_sum
 ..  autofunction:: paddle.v2.fluid.layers.reduce_sum
    :noindex:

+
+reduce_mean
+---------
+..  autofunction:: paddle.v2.fluid.layers.reduce_mean
+    :noindex:
+
--- a/doc/design/block.md
+++ b/doc/design/block.md
@ -291,10 +291,10 @@ public:
  }

  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& place) const override {
    PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
    for (auto& op : runtime_table_.ops()) {
-      op->Run(scope, dev_ctx);
+      op->Run(scope, place);
    }
  }

--- a/doc/design/support_new_device.md
+++ b/doc/design/support_new_device.md
@ -25,13 +25,14 @@ There are mainly three parts that we have to consider while integrating a new de

 ### Place and DeviceContext

+Please remind that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.

 #### Place
-Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent different devices and computing libraries. There are inheritance relationships between different kinds of `Place`.
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add corresponding `DevicePlace`.

 ```
-        |   CPUPlace   --> MKLDNNPlace
-Place --|   CUDAPlace  --> CUDNNPlace
+        |   CPUPlace
+Place --|   CUDAPlace
        |   FPGAPlace
 ```

@ -43,7 +44,7 @@ typedef boost::variant<CUDAPlace, CPUPlace, FPGAPlace> Place;

 #### DeviceContext

-Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different hardwares, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
+Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.


 ```
@ -106,7 +107,7 @@ template <typename Place>
 size_t Used(Place place);
 ```

-To implementing these interfaces, we have to implement MemoryAllocator for different Devices
+To implement these interfaces, we have to implement MemoryAllocator for different Devices.


 #### Tensor
@ -243,6 +244,7 @@ REGISTER_OP_CUDA_KERNEL(
 Generally, we will impelement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not sutibale on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run at GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.


-We will discuss how to implement an efficient OpKernel switch policy. 
+For more details, please refer to following docs:

- TBD
+- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md)
+- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md)
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@ -70,13 +70,13 @@ PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其
   :header: "依赖", "版本", "说明"
   :widths: 10, 15, 30

-   "CMake", ">=3.5", ""
+   "CMake", ">=3.2", ""
   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
-   "Python", "2.7.x", "依赖libpython2.7.so"
-   "pip", ">=9.0", ""
-   "numpy", "", ""
+   "Python", "2.7.x", "依赖libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
   "SWIG", ">=2.0", ""
-   "Go", ">=1.8", "可选"
+   "Go", ">=1.8", "可选"


 .. _build_options:
--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@ -76,13 +76,13 @@ will be downloaded automatically.
   :header: "Dependency", "Version", "Description"
   :widths: 10, 15, 30

-   "CMake", ">=3.5", ""
+   "CMake", ">=3.2", ""
   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
-   "Python", "2.7.x", "Need libpython2.7.so"
-   "pip", ">=9.0", ""
-   "numpy", "", ""
+   "Python", "2.7.x", "Need libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
   "SWIG", ">=2.0", ""
-   "Go", ">=1.8", "Optional"
+   "Go", ">=1.8", "Optional"


 .. _build_options:
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -30,7 +30,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference)
-cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
+cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)

 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
@ -59,5 +59,8 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)

-cc_library(init SRCS init.cc DEPS gflags executor place stringpiece)
+cc_test(threadpool_test SRCS threadpool_test.cc)
+cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece)
 cc_test(init_test SRCS init_test.cc DEPS init)
+
+cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context)
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@ -90,6 +90,21 @@ OpDesc *BlockDesc::PrependOp() {
  return ops_.front().get();
 }

+void BlockDesc::RemoveOp(size_t s, size_t e) {
+  if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
+    return;
+  }
+  need_update_ = true;
+  for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) {
+    auto names = (*it)->InputArgumentNames();
+    for (auto n : names) {
+      // TODO(typhoonzero): delete vars if no other op use it.
+      VLOG(3) << "deleting var " << n;
+    }
+  }
+  ops_.erase(ops_.begin() + s, ops_.begin() + e);
+}
+
 std::vector<OpDesc *> BlockDesc::AllOps() const {
  std::vector<OpDesc *> res;
  for (const auto &op : ops_) {
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@ -79,6 +79,8 @@ class BlockDesc {

  OpDesc *PrependOp();

+  void RemoveOp(size_t s, size_t e);
+
  std::vector<OpDesc *> AllOps() const;

  size_t OpSize() const { return ops_.size(); }
--- a/paddle/framework/data_layout.h
+++ b/paddle/framework/data_layout.h
@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <iostream>
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+enum DataLayout {
+  kNHWC = 0,
+  kNCHW = 1,
+  kAnyLayout = 2,
+};
+
+inline DataLayout StringToDataLayout(const std::string& str) {
+  if (str == "NHWC" || str == "nhwc") {
+    return DataLayout::kNHWC;
+  } else if (str == "NCHW" || str == "nchw") {
+    return DataLayout::kNCHW;
+  } else {
+    PADDLE_THROW("Unknown storage order string: %s", str);
+  }
+}
+
+inline std::string DataLayoutToString(const DataLayout& data_layout) {
+  switch (data_layout) {
+    case kNHWC:
+      return "NHWC";
+    case kNCHW:
+      return "NCHW";
+    case kAnyLayout:
+      return "ANY_LAYOUT";
+    default:
+      PADDLE_THROW("unknown DataLayou %d", data_layout);
+  }
+}
+
+inline std::ostream& operator<<(std::ostream& out, DataLayout l) {
+  out << DataLayoutToString(l);
+  return out;
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@ -33,13 +33,7 @@ namespace framework {
 const std::string kFeedOpType = "feed";
 const std::string kFetchOpType = "fetch";

-DeviceContextPool* DeviceContextPool::pool = nullptr;
-
-Executor::Executor(const std::vector<platform::Place>& places) {
-  DeviceContextPool& pool = DeviceContextPool::Get();
-  auto borrowed_contexts = pool.Borrow(places);
-  device_contexts_.swap(borrowed_contexts);
-}
+Executor::Executor(const platform::Place& place) : place_(place) {}

 static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
  if (var_type == proto::VarDesc::LOD_TENSOR) {
@ -65,47 +59,48 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
 }

 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
-                   bool create_local_scope) {
+                   bool create_local_scope, bool create_vars) {
  // TODO(tonyyang-svail):
  //    - only runs on the first device (i.e. no interdevice communication)
  //    - will change to use multiple blocks for RNN op and Cond Op
  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), pdesc.Size());
  auto& block = pdesc.Block(block_id);
-  auto& device = device_contexts_[0];

  Scope* local_scope = scope;
-  if (create_local_scope) {
-    local_scope = &scope->NewScope();
-    for (auto& var : block.AllVars()) {
-      if (var->Name() == framework::kEmptyVarName) {
-        continue;
+  if (create_vars) {
+    if (create_local_scope) {
+      local_scope = &scope->NewScope();
+      for (auto& var : block.AllVars()) {
+        if (var->Name() == framework::kEmptyVarName) {
+          continue;
+        }
+
+        if (var->Persistable()) {
+          auto* ptr = scope->Var(var->Name());
+          CreateTensor(ptr, var->GetType());
+          VLOG(3) << "Create Variable " << var->Name()
+                  << " global, which pointer is " << ptr;
+        } else {
+          auto* ptr = local_scope->Var(var->Name());
+          CreateTensor(ptr, var->GetType());
+          VLOG(3) << "Create Variable " << var->Name()
+                  << " locally, which pointer is " << ptr;
+        }
      }
-
-      if (var->Persistable()) {
-        auto* ptr = scope->Var(var->Name());
-        CreateTensor(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " global, which pointer is " << ptr;
-      } else {
+    } else {
+      for (auto& var : block.AllVars()) {
        auto* ptr = local_scope->Var(var->Name());
        CreateTensor(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " locally, which pointer is " << ptr;
+        VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
+                << ptr;
      }
-    }
-  } else {
-    for (auto& var : block.AllVars()) {
-      auto* ptr = local_scope->Var(var->Name());
-      CreateTensor(ptr, var->GetType());
-      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
-              << ptr;
-    }
-  }
+    }  // if (create_local_scope)
+  }    // if (create_vars)

  for (auto& op_desc : block.AllOps()) {
    auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
    VLOG(3) << op->DebugString();
-    op->Run(*local_scope, *device);
+    op->Run(*local_scope, place_);
  }
  if (create_local_scope) {
    scope->DeleteScope(local_scope);
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@ -14,9 +14,6 @@ limitations under the License. */

 #pragma once

-#include <map>
-#include <unordered_map>
-
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/scope.h"
@ -26,86 +23,13 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-class DeviceContextPool {
- public:
-  static DeviceContextPool& Get() {
-    PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
-    return *pool;
-  }
-
-  static DeviceContextPool& Create(const std::vector<platform::Place>& places) {
-    if (pool == nullptr) {
-      pool = new DeviceContextPool(places);
-    }
-    return *pool;
-  }
-
-  std::vector<const platform::DeviceContext*> Borrow(
-      const std::vector<platform::Place>& places) {
-    PADDLE_ENFORCE_GT(places.size(), 0);
-    PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
-    std::vector<const platform::DeviceContext*> borrowed_contexts;
-    for (auto& place : places) {
-      auto range = device_contexts_.equal_range(place);
-      if (range.first == range.second) {
-        PADDLE_THROW(
-            "'Place' is not supported, Please re-compile with WITH_GPU "
-            "option");
-      }
-      // TODO(dzhwinter) : assign the first found device. Will enhanced later.
-      // device load balancer maybe useful here.
-      borrowed_contexts.emplace_back(range.first->second);
-    }
-    return borrowed_contexts;
-  }
-
-  explicit DeviceContextPool(const std::vector<platform::Place>& places) {
-    PADDLE_ENFORCE_GT(places.size(), 0);
-    for (size_t i = 0; i < places.size(); i++) {
-      if (platform::is_cpu_place(places[i])) {
-        device_contexts_.emplace(
-            places[i], new platform::CPUDeviceContext(
-                           boost::get<platform::CPUPlace>(places[i])));
-      } else if (platform::is_gpu_place(places[i])) {
-#ifdef PADDLE_WITH_CUDA
-        device_contexts_.emplace(
-            places[i], new platform::CUDADeviceContext(
-                           boost::get<platform::GPUPlace>(places[i])));
-#else
-        PADDLE_THROW(
-            "'GPUPlace' is not supported, Please re-compile with WITH_GPU "
-            "option");
-#endif
-      }
-    }
-  }
-
-  ~DeviceContextPool() {}
-
- private:
-  static DeviceContextPool* pool;
-  struct Hash {
-    std::hash<int> hash_;
-    size_t operator()(const platform::Place& place) const {
-      return hash_(place.which());
-    }
-  };
-  std::unordered_multimap<const platform::Place, const platform::DeviceContext*,
-                          Hash>
-      device_contexts_;
-  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
-};
-
 class Executor {
 public:
  // TODO(dzhwinter) : Do not rely on this function, it will be removed
  explicit Executor(const platform::DeviceContext& device)
-      : Executor(std::vector<platform::Place>({device.GetPlace()})) {}
-
-  explicit Executor(const platform::Place& place)
-      : Executor(std::vector<platform::Place>({place})) {}
+      : Executor(device.GetPlace()) {}

-  explicit Executor(const std::vector<platform::Place>& places);
+  explicit Executor(const platform::Place& place);

  /* @Brief
   * Runtime evaluation of the given ProgramDesc under certain Scope
@ -114,10 +38,11 @@ class Executor {
   *  ProgramDesc
   *  Scope
   */
-  void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true);
+  void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true,
+           bool create_vars = true);

 private:
-  std::vector<const platform::DeviceContext*> device_contexts_;
+  const platform::Place place_;
 };

 }  // namespace framework
--- a/paddle/framework/grad_op_desc_maker.h
+++ b/paddle/framework/grad_op_desc_maker.h
@ -22,6 +22,14 @@
 namespace paddle {
 namespace framework {

+/*
+  This functor class is responsible for creating the gradient ops for the given
+  operator fwd_op. After it is called (through operator()), the pairs of
+  (gradient variable, corresponding input variable of fwd_op) will be added to
+  grad_to_var. If an input variable of fwd_op is contained in no_grad_set, its
+  gradient varialbe will be ignored or kEmptyVarName depending on the template
+  argument DropEmptyIG in the derived classes.
+ */
 class GradOpDescMakerBase {
 public:
  explicit GradOpDescMakerBase(
@ -56,6 +64,16 @@ class GradOpDescMakerBase {
    if (!drop_empty_grad) {
      return ret_val;
    }
+    PADDLE_ENFORCE_LE(var_names.size(), 1UL,
+                      "BUG from operator developer:"
+                      " for input argument with a list of variables, "
+                      " drop_empty_grad is not allowed because it makes"
+                      " the correspondence bewteen a variable and its gradient"
+                      " ambiguous. Use REGISTER_OP_EX to register the op"
+                      " or call InputGrad(?,false) in GradOpDescMaker."
+                      " Op type %s",
+                      fwd_op_.Type());
+
    std::vector<std::string> dropped_ret_val;
    dropped_ret_val.reserve(ret_val.size());
    std::copy_if(ret_val.begin(), ret_val.end(),
--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
@ -14,8 +14,8 @@
 #include <algorithm>
 #include <string>

-#include "paddle/framework/executor.h"
 #include "paddle/framework/init.h"
+#include "paddle/platform/device_context.h"
 #include "paddle/platform/place.h"
 #include "paddle/string/piece.h"

@ -48,13 +48,13 @@ bool InitDevices(const std::vector<std::string> &devices) {
  std::vector<platform::Place> places;
  for (auto &device : devices) {
    auto p = string::Piece(device);
-    if (string::Find(p, ':', 0) == string::Piece::npos) {
+    if (string::HasPrefix(p, "CPU")) {
      places.emplace_back(platform::CPUPlace());
    } else if (string::HasPrefix(p, "GPU")) {
 #ifdef PADDLE_WITH_CUDA
      auto pos = string::RFind(p, ':', string::Piece::npos);
      auto number = device.substr(pos + 1);
-      places.emplace_back(platform::GPUPlace(std::stoi(number)));
+      places.emplace_back(platform::CUDAPlace(std::stoi(number)));
 #else
      LOG(WARNING)
          << "'GPU' is not supported, Please re-compile with WITH_GPU option";
@ -69,10 +69,9 @@ bool InitDevices(const std::vector<std::string> &devices) {
                     return platform::is_cpu_place(place);
                   }) == places.end()) {
    places.emplace_back(platform::CPUPlace());
-    LOG(WARNING) << "Not specified any device, use CPU by Default.";
+    LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
  }
-  DeviceContextPool::Create(places);
-  return true;
+  platform::DeviceContextPool::Create(places);
  return true;
 }

--- a/paddle/framework/init_test.cc
+++ b/paddle/framework/init_test.cc
@ -23,5 +23,9 @@ TEST(Init, InitDevices) {
 #ifdef PADDLE_WITH_CUDA
  std::vector<std::string> ds2 = {"CPU", "GPU:0", "GPU:1"};
  ASSERT_EQ(InitDevices(ds2), true);
+
+  // test re-init
+  std::vector<std::string> ds3 = {"GPU:0", "GPU:1"};
+  ASSERT_EQ(InitDevices(ds3), true);
 #endif
 }
--- a/paddle/framework/library_type.h
+++ b/paddle/framework/library_type.h
@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace framework {
+
+// For more details about the design of LibraryType, Please refer to
+// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library
+
+enum LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2 };
+
+inline std::string LibraryTypeToString(const LibraryType& library_type) {
+  switch (library_type) {
+    case kPlain:
+      return "PLAIN";
+    case kMKLDNN:
+      return "MKLDNN";
+    case kCUDNN:
+      return "CUDNN";
+    default:
+      PADDLE_THROW("unknown LibraryType %d", library_type);
+  }
+}
+
+inline std::ostream& operator<<(std::ostream& out, LibraryType l) {
+  out << LibraryTypeToString(l);
+  return out;
+}
+
+}  // namespace
+}  // framework
--- a/Show More
+++ b/Show More