Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/hide_api_cont
commit 4ff1bde5fb
@@ -0,0 +1,167 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/hierarchical_sigmoid_op.h"
#include <vector>

namespace paddle {
namespace operators {

/**
 * Organize the classes into a binary tree. At each node, a sigmoid function
 * is used to calculate the probability of belonging to the right branch.
 * This idea is from "F. Morin, Y. Bengio (AISTATS 05):
 * Hierarchical Probabilistic Neural Network Language Model."
 *
 * Here we use a simple way of making the binary tree.
 * Assuming the number of classes C = 6,
 * the classes are organized as a binary tree in the following way:
 *
 * @code{.py}
 * *-*-*- 2
 * | | |- 3
 * | |
 * | |-*- 4
 * |   |- 5
 * |
 * |-*- 0
 *   |- 1
 * @endcode
 *
 * where * indicates an internal node, and each leaf node represents a class.
 * - Node 0 ... C-2 are internal nodes.
 * - Node C-1 ... 2C-2 are leaf nodes.
 * - Class c is represented by leaf node \f$c+C-1\f$.
 *
 * We assign an id for each node:
 * - the id of the root is 0.
 * - the left child of a node i is 2*i+1.
 * - the right child of a node i is 2*i+2.
 *
 * It's easy to see that:
 * - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$.
 * - the j-th level ancestor of node i is
 *   \f$\left\lfloor(i+1)/2^{j+1}\right\rfloor - 1\f$.
 * - a node i is a left child of its parent if \f$(i-1)\%2==0\f$.
 */
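// A minimal illustrative sketch (not used by the operator) that checks the
// node-id arithmetic described above at compile time, assuming the 0-based
// ids from the comment: left(i) = 2*i+1, right(i) = 2*i+2,
// parent(i) = (i-1)/2, and a node is a left child iff (i-1) % 2 == 0.
static_assert((2 * 3 + 1 - 1) / 2 == 3, "parent of left child of 3 is 3");
static_assert((2 * 3 + 2 - 1) / 2 == 3, "parent of right child of 3 is 3");
static_assert(((2 * 3 + 1) - 1) % 2 == 0, "2*i+1 is always a left child");
static_assert(((2 * 3 + 2) - 1) % 2 != 0, "2*i+2 is always a right child");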

class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("PreOut"),
                   "Output(PreOut) should not be null.");
    const int64_t batch_size = ctx->GetInputDim("X")[0];
    std::vector<int64_t> output_shape({batch_size, 1});
    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
        ctx.GetPlace());
  }
};

template <typename AttrType>
class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "(Tensor, required) The input tensor with shape [N, D], "
             "where N is the size of mini-batch, and D is the feature size.");
    AddInput("W",
             "(Tensor, required) The parameters of the hierarchical "
             "sigmoid operator; each of them is a 2-D tensor with shape "
             "[num_classes - 1, D].");
    AddInput("Label",
             "(Tensor, required) The labels of the training data. It's a "
             "tensor with shape [N, 1].");
    AddInput("Bias",
             "(Tensor, optional) The bias is a tensor with shape "
             "[1, num_classes - 1].");
    AddOutput("Out",
              "(Tensor, required) The output of the hierarchical sigmoid "
              "operator. The shape is [N, 1].");
    AddOutput("PreOut",
              "(Tensor, required) An intermediate 2-D tensor with shape "
              "[batch_size, code_length], where code_length represents the "
              "maximum path length from root to leaf nodes.")
        .AsIntermediate();
    AddAttr<AttrType>("num_classes", "(int, required) The number of classes.")
        .SetDefault(2);
    AddComment(R"DOC(
The hierarchical sigmoid operator organizes the classes into a binary tree.
At each node, a sigmoid function is used to calculate the probability of
belonging to the right branch. This idea is from
"F. Morin, Y. Bengio (AISTATS 05):
Hierarchical Probabilistic Neural Network Language Model."
)DOC");
  }
};

class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("PreOut"),
                   "Input(PreOut) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W")),
                   "Output(W@Grad) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                   "Output(X@Grad) should not be null.");
    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
      ctx->SetOutputDim(framework::GradVarName("Bias"),
                        ctx->GetInputDim("Bias"));
    }
    ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
        ctx.GetPlace());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp,
                  ops::HierarchicalSigmoidOpMaker<int>,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp);
REGISTER_OP_CPU_KERNEL(
    hierarchical_sigmoid,
    ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext,
                                     double>);
REGISTER_OP_CPU_KERNEL(
    hierarchical_sigmoid_grad,
    ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
                                         float>,
    ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
                                         double>);
@@ -0,0 +1,135 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <iostream>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/clip_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {

template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
using platform::Transform;

template <typename DeviceContext, typename T>
class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<framework::Tensor>("X");
    auto* w = ctx.Input<framework::Tensor>("W");
    auto* label = ctx.Input<framework::Tensor>("Label");
    auto* bias = ctx.Input<framework::Tensor>("Bias");
    auto* out = ctx.Output<framework::Tensor>("Out");
    auto* pre_out = ctx.Output<framework::Tensor>("PreOut");
    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
    int64_t code_length = math::FindLastSet(num_classes - 1);
    int64_t batch_size = in->dims()[0];
    framework::Tensor sum;
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto* pre_out_data = pre_out->mutable_data<T>(
        framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
    // Not all class (leaf) nodes' path lengths equal code_length, so
    // initialize PreOut with zeros to keep the entries beyond each sample's
    // path well defined.
    math::SetConstant<DeviceContext, T> zero;
    zero(dev_ctx, pre_out, static_cast<T>(0.0));
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    math::RowwiseSum<DeviceContext, T> row_sum;
    math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());

    std::vector<int64_t> sum_dims({batch_size, 1UL});
    sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace());
    auto sum_mat = EigenMatrix<T>::From(sum);
    out->mutable_data<T>(ctx.GetPlace());
    auto out_mat = framework::EigenVector<T>::Flatten(*out);
    if (bias) {
      bit_code.Add(pre_out, *bias);
    }
    bit_code.Mul(pre_out, *w, *in);
    // clip to [-40, 40]
    Transform<DeviceContext> trans;
    trans(ctx.template device_context<DeviceContext>(), pre_out_data,
          pre_out_data + pre_out->numel(), pre_out_data,
          ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
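    // What follows computes, per sample i (a sketch of the math; z_ij is the
    // clipped pre_out(i, j) and b_ij the bit given by calc_bit on the
    // sample's path, ignoring the out-of-path entries noted in the TODO):
    //   Out(i) = sum_j [log(1 + exp(z_ij)) - b_ij * z_ij],
    // i.e. the summed binary cross entropy of sigmoid(z_ij) against b_ij.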
    bit_code.Sum(*pre_out, out, static_cast<T>(-1));
    // use softrelu to calculate cross entropy
    pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
    row_sum(dev_ctx, *pre_out, &sum);
    // TODO(guosheng): Subtract the out-of-path loss, since not all
    // class (leaf) nodes' path lengths equal code_length. It won't break the
    // gradient check, since both sides include the same out-of-path loss and
    // it cancels out.
    out_mat.device(place) = sum_mat + out_mat;
  }
};

template <typename DeviceContext, typename T>
class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<framework::Tensor>("X");
    auto* w = ctx.Input<framework::Tensor>("W");
    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
    auto* w_grad = ctx.Output<framework::Tensor>(framework::GradVarName("W"));
    auto* bias_grad =
        ctx.Output<framework::Tensor>(framework::GradVarName("Bias"));
    auto* label = ctx.Input<framework::Tensor>("Label");
    auto* pre_out = ctx.Input<framework::Tensor>("PreOut");
    auto* out_grad =
        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    framework::Tensor pre_out_grad;

    pre_out_grad.mutable_data<T>(pre_out->dims(), ctx.GetPlace());
    in_grad->mutable_data<T>(ctx.GetPlace());
    w_grad->mutable_data<T>(ctx.GetPlace());
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    math::SetConstant<DeviceContext, T> zero;
    zero(dev_ctx, in_grad, static_cast<T>(0.0));
    zero(dev_ctx, w_grad, static_cast<T>(0.0));

    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
    math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());

    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
    auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
    auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
    Eigen::array<int, 2> bcast({{1, static_cast<int>(pre_out_grad.dims()[1])}});

    // softrelu derivative
    pre_out_grad_mat.device(place) =
        static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
    bit_code.Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
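    // PreOut saved from the forward pass holds softrelu(z) = log(1 + exp(z)),
    // so 1 - 1 / exp(PreOut) equals sigmoid(z); after Sub, pre_out_grad holds
    // sigmoid(z_ij) - b_ij, the derivative of the per-node cross entropy with
    // respect to z_ij (an explanatory sketch, not extra computation).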
    pre_out_grad_mat.device(place) =
        pre_out_grad_mat * out_grad_mat.broadcast(bcast);
    // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
    // be consistent with the clipping in forward.
    if (bias_grad) {
      bias_grad->mutable_data<T>(ctx.GetPlace());
      zero(dev_ctx, bias_grad, static_cast<T>(0.0));
      bit_code.AddGrad(pre_out_grad, bias_grad);
    }
    bit_code.MulGradWeight(pre_out_grad, w_grad, *in);
    bit_code.MulGradError(pre_out_grad, *w, in_grad);
  }
};

}  // namespace operators
}  // namespace paddle
@@ -0,0 +1,176 @@
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include <iostream>
namespace paddle {
namespace operators {
namespace math {

template <typename T>
void MatrixBitCodeFunctor<T>::Add(framework::Tensor* tmat,
                                  const framework::Tensor& vec) {
  SimpleCodeTable code_table(num_classes_);
  size_t batch_size = tmat->dims()[0];
  size_t width = tmat->dims()[1];
  for (size_t i = 0; i < batch_size; ++i) {
    auto code = code_table(static_cast<size_t>(ids_[i]));
    int code_length = code.get_length();
    for (int j = 0; j < code_length; ++j) {
      size_t index = code.calc_index(j);
      tmat->data<T>()[i * width + j] += vec.data<T>()[index];
    }
  }
}

template <typename T>
void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor& tmat,
                                      framework::Tensor* vec) {
  SimpleCodeTable code_table(num_classes_);
  size_t batch_size = tmat.dims()[0];
  size_t width = tmat.dims()[1];
  for (size_t i = 0; i < batch_size; ++i) {
    auto code = code_table(static_cast<size_t>(ids_[i]));
    int code_length = code.get_length();
    for (int j = 0; j < code_length; ++j) {
      size_t index = code.calc_index(j);
      vec->data<T>()[index] += tmat.data<T>()[i * width + j];
    }
  }
}

template <typename T>
void MatrixBitCodeFunctor<T>::Sum(const framework::Tensor& tmat,
                                  framework::Tensor* sum, T scale_sum) {
  SimpleCodeTable code_table(num_classes_);
  size_t num_samples = tmat.dims()[0];
  size_t o_width = tmat.dims()[1];
  for (size_t i = 0; i < num_samples; ++i) {
    T sm = static_cast<T>(0.0);
    auto code = code_table(static_cast<size_t>(ids_[i]));
    int code_length = code.get_length();
    for (int j = 0; j < code_length; ++j) {
      if (code.calc_bit(j)) {
        // calc_bit starts from the rightmost bit, while the data in tmat[i]
        // is stored in the reverse order.
        sm += tmat.data<T>()[i * o_width + j];
      }
    }
    sum->data<T>()[i] = scale_sum * sm;
  }
}

template <typename T>
void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
                                  const framework::Tensor& weight,
                                  const framework::Tensor& input) {
  SimpleCodeTable code_table(num_classes_);
  size_t num_samples = tmat->dims()[0];
  size_t tmat_width = tmat->dims()[1];
  size_t input_width = input.dims()[1];
  size_t weight_width = weight.dims()[1];
  auto tmat_value = tmat->data<T>();
  auto weight_value = weight.data<T>();
  auto input_value = input.data<T>();
  for (size_t i = 0; i < num_samples; ++i) {
    auto code = code_table(static_cast<size_t>(ids_[i]));
    int code_length = code.get_length();
    for (int j = 0; j < code_length; ++j) {
      size_t index = code.calc_index(j);
      T sum = static_cast<T>(0.0);
      for (size_t k = 0; k < input_width; ++k) {
        sum += weight_value[weight_width * index + k] *
               input_value[input_width * i + k];
      }
      tmat_value[i * tmat_width + j] += sum;
    }
  }
}

template <typename T>
void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
                                            framework::Tensor* weight,
                                            const framework::Tensor& input) {
  SimpleCodeTable code_table(num_classes_);
  size_t num_samples = tmat.dims()[0];
  size_t input_width = input.dims()[1];
  size_t tmat_width = tmat.dims()[1];
  size_t weight_width = weight->dims()[1];
  auto tmat_value = tmat.data<T>();
  auto weight_value = weight->data<T>();
  auto input_value = input.data<T>();
  for (size_t i = 0; i < num_samples; ++i) {
    auto code = code_table(static_cast<size_t>(ids_[i]));
    int code_length = code.get_length();
    for (int j = 0; j < code_length; ++j) {
      size_t index = code.calc_index(j);

      for (size_t k = 0; k < input_width; ++k) {
        weight_value[weight_width * index + k] +=
            tmat_value[i * tmat_width + j] * input_value[input_width * i + k];
      }
    }
  }
}

template <typename T>
void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
                                           const framework::Tensor& weight,
                                           framework::Tensor* input) {
  SimpleCodeTable code_table(num_classes_);
  size_t num_samples = tmat.dims()[0];
  size_t tmat_width = tmat.dims()[1];
  size_t input_width = input->dims()[1];
  size_t weight_width = weight.dims()[1];
  auto tmat_value = tmat.data<T>();
  auto weight_value = weight.data<T>();
  auto input_value = input->data<T>();

  for (size_t i = 0; i < num_samples; ++i) {
    auto code = code_table(static_cast<size_t>(ids_[i]));
    int code_length = code.get_length();
    for (int j = 0; j < code_length; ++j) {
      size_t index = code.calc_index(j);

      for (size_t k = 0; k < input_width; ++k) {
        input_value[input_width * i + k] +=
            tmat_value[i * tmat_width + j] *
            weight_value[weight_width * index + k];
      }
    }
  }
}

template <typename T>
void MatrixBitCodeFunctor<T>::Sub(framework::Tensor* tmat) {
  SimpleCodeTable code_table(num_classes_);
  size_t num_samples = tmat->dims()[0];
  size_t o_width = tmat->dims()[1];
  for (size_t i = 0; i < num_samples; ++i) {
    auto code = code_table(static_cast<size_t>(ids_[i]));
    int code_length = code.get_length();
    for (int j = 0; j < code_length; ++j) {
      if (code.calc_bit(j)) {
        tmat->data<T>()[i * o_width + j] -= 1;
      }
    }
  }
}

template class MatrixBitCodeFunctor<float>;
template class MatrixBitCodeFunctor<double>;

}  // namespace math
}  // namespace operators
}  // namespace paddle
@@ -0,0 +1,143 @@
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace operators {
namespace math {
/**
 * SimpleCodeTable class should support 3 functions:
 *
 * size_t size()
 *   return the number of ids
 *
 * int get_max_code_length()
 *   return the maximal code length
 *
 * SimpleCode operator()(size_t i)
 *   return the i-th code. The Code class is described below.
 *
 * SimpleCode class should support 3 functions:
 *
 * int get_length()
 *   return the length of the code
 *
 * size_t calc_index(int bit)
 *   bit ranges from 0 to get_length() - 1
 *   return the index of the (1+bit)-level parent
 *
 * bool calc_bit(int bit)
 *   return true if the bit-level parent is the right child of the
 *   (1+bit)-level parent
 *
 */

/**
 * return the 1-based index of the highest bit set
 *
 * for x > 0:
 * \f[
 *   FindLastSet(x) = 1 + \lfloor \log_{2}x \rfloor
 * \f]
 */
inline constexpr size_t FindLastSet(size_t x) {
  return std::is_same<size_t, unsigned int>::value
             ? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0)
             : (std::is_same<size_t, unsigned long>::value  // NOLINT
                    ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
                    : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
}
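// A few compile-time checks (illustrative only) of the values FindLastSet
// returns, matching 1 + floor(log2(x)) for x > 0 and 0 for x == 0:
static_assert(FindLastSet(0) == 0, "no bit is set in 0");
static_assert(FindLastSet(1) == 1, "0b1    -> highest set bit is bit 1");
static_assert(FindLastSet(6) == 3, "0b110  -> highest set bit is bit 3");
static_assert(FindLastSet(8) == 4, "0b1000 -> highest set bit is bit 4");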

struct SimpleCode {
  SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {}
  /**
   * Here the id of the root should be 1 rather than 0, so the encoding of
   * class c is `c + num_classes`, and all siblings can get the same weight
   * index using prefixes.
   * The weight index is a prefix of the encoding, so calc_index leaves out
   * the rightmost bit.
   * The binary classification path is a suffix of the encoding, so calc_bit
   * leaves out the leftmost bit.
   */
  inline size_t calc_index(int bit) const { return (c_ >> (bit + 1)) - 1; }
  inline bool calc_bit(int bit) const { return c_ & (1 << bit); }
  inline int get_length() const { return FindLastSet(c_) - 1; }

 private:
  size_t c_;
};
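// A worked example (illustrative only): with num_classes = 6 and class c = 5,
// c_ = 11 = 0b1011, so get_length() = FindLastSet(11) - 1 = 3, and increasing
// j walks from the leaf's parent up to the root:
//   j = 0: calc_index(0) = (11 >> 1) - 1 = 4, calc_bit(0) = true
//   j = 1: calc_index(1) = (11 >> 2) - 1 = 1, calc_bit(1) = true
//   j = 2: calc_index(2) = (11 >> 3) - 1 = 0, calc_bit(2) = false
// i.e. the path for this sample uses weight rows 4, 1 and 0.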

struct SimpleCodeTable {
  explicit SimpleCodeTable(size_t num_classes) : num_classes_(num_classes) {}
  SimpleCode operator()(size_t code) const {
    return SimpleCode(code, num_classes_);
  }
  size_t size() const { return num_classes_; }
  int get_max_code_length() const { return FindLastSet(num_classes_ - 1); }

 private:
  size_t num_classes_;
};

template <typename T>
class MatrixBitCodeFunctor {
 public:
  explicit MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids)
      : num_classes_(num_classes), ids_(ids) {}
  /* For j < code_length
       tmat(i, j) += vec(0, index(i, j))
  */
  void Add(framework::Tensor* tmat, const framework::Tensor& vec);

  /* For j < code_length
       vec(0, index(i, j)) += tmat(i, j)
  */
  void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec);

  /* For j < code_length
       sum(i, 0) = \sum_j bit(i, j) * tmat(i, j)
  */
  void Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum);

  /* For j < code_length
       tmat(i, j) -= bit(i, j)
  */
  void Sub(framework::Tensor* tmat);
  /* For j < code_length
       tmat(i, j) += weight.row(index(i, j)) * input.row(i)
  */
  void Mul(framework::Tensor* tmat, const framework::Tensor& weight,
           const framework::Tensor& input);

  /* For index(i, j) >= 0:
       weight.row(index(i, j)) += tmat(i, j) * input.row(i)
  */
  void MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight,
                     const framework::Tensor& input);
  /* For j < code_length
       input.row(i) += tmat(i, j) * weight.row(index(i, j))
  */
  void MulGradError(const framework::Tensor& tmat,
                    const framework::Tensor& weight, framework::Tensor* input);

  size_t num_classes_;
  const int64_t* ids_;
};
}  // namespace math
}  // namespace operators
}  // namespace paddle
@@ -0,0 +1,202 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

class SqueezeOpInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of SqueezeOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of SqueezeOp should not be null.");

    const auto &x_dims = ctx->GetInputDim("X");
    // Check the rank of the input tensor (Eigen limits it to at most 6).
    PADDLE_ENFORCE(x_dims.size() <= 6,
                   "Invalid dimensions, the rank of Input(X) "
                   "should be in the range of [1, 6] (Eigen limit).");

    const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
    for (int a : axes) {
      PADDLE_ENFORCE_LT(a, x_dims.size(),
                        "The squeeze axis should be less than the input "
                        "tensor's rank.");
    }

    auto out_dims = GetOutputShape(axes, x_dims);
    ctx->SetOutputDim("Out", out_dims);
    if (x_dims[0] == out_dims[0]) {
      // Only pass LoD when the first dimension of output and Input(X)
      // are the same.
      ctx->ShareLoD("X", "Out");
    }
  }

  static framework::DDim GetOutputShape(const std::vector<int> squeeze_dims,
                                        const framework::DDim &in_dims) {
    size_t num_squeeze_dims = squeeze_dims.size();
    int cnt_squeezed_dims = 0;
    bool should_squeeze[9] = {false};

    // Determine the number of dimensions of the output tensor after squeeze:
    // mark and count the dimensions that need to be squeezed.
    if (num_squeeze_dims == 0) {
      for (int idx = 0; idx < in_dims.size(); ++idx) {
        if (in_dims[idx] == 1) {
          should_squeeze[idx] = true;
          ++cnt_squeezed_dims;
        }
      }
    } else {
      for (size_t idx = 0; idx < num_squeeze_dims; ++idx) {
        int current = squeeze_dims[idx] < 0 ? squeeze_dims[idx] + in_dims.size()
                                            : squeeze_dims[idx];
        // Check the current index; the upper bound has already been checked
        // above.
        PADDLE_ENFORCE(current >= 0,
                       "Invalid axis, the negative axis is out of range.");
        PADDLE_ENFORCE(in_dims[current] == 1,
                       "Invalid axis index, the dimension to be squeezed "
                       "should be equal to 1.");

        if (!(should_squeeze[current])) {
          ++cnt_squeezed_dims;
        }
        should_squeeze[current] = true;
      }
    }
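    // At this point, for the DOC example below (in_dims = (1, 3, 1, 5),
    // axes = {0}), should_squeeze is {true, false, false, false} and
    // cnt_squeezed_dims is 1, so the output shape becomes (3, 1, 5).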

    // Make output dimensions
    std::vector<int64_t> output_shape(in_dims.size() - cnt_squeezed_dims, 0);
    for (int in_idx = 0, out_idx = 0; in_idx < in_dims.size(); ++in_idx) {
      if (!should_squeeze[in_idx]) {
        output_shape[out_idx++] = in_dims[in_idx];
      }
    }

    return framework::make_ddim(output_shape);
  }
};

class SqueezeOp : public framework::OperatorBase {
 public:
  using OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
    auto &axes = Attr<std::vector<int>>("axes");
    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
    auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims);

    framework::AttributeMap attrs;
    attrs["shape"] = framework::vectorize2int(out_dims);
    attrs["inplace"] = Attr<bool>("inplace");
    // Invoke the Reshape op.
    auto reshape_op = framework::OpRegistry::CreateOp(
        "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
        {{"Out", {Output("Out")}}}, attrs);
    reshape_op->Run(scope, place);
  }
};

class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X", "(Tensor). The input tensor of the squeeze operator.");
    AddOutput("Out", "(Tensor). The output tensor of the squeeze operator.");
    AddAttr<std::vector<int>>("axes",
                              "(std::vector<int>). List of integers "
                              "indicating the dimensions to squeeze.")
        .SetDefault({});
    AddAttr<bool>("inplace",
                  "(default: false) Squeeze the source tensor's shape without "
                  "memory copy. When Attr(inplace) is set true, the output "
                  "tensor shares memory with Input(X); otherwise, a new "
                  "output tensor is created, and its data are copied from "
                  "Input(X).")
        .SetDefault(false);
    AddComment(R"DOC(
Squeeze Operator.

Remove single-dimensional entries from the shape of a tensor.
Takes a parameter axes with a list of axes to squeeze.
If axes is not provided, all the single dimensions will be removed from the shape.
If an axis is selected with shape entry not equal to one, an error is raised.

Examples:
Case 1:
  Given
    X.shape = (1, 3, 1, 5)
  and
    axes = [0]
  we get:
    Out.shape = (3, 1, 5)

Case 2:
  Given
    X.shape = (1, 3, 1, 5)
  and
    axes = []
  we get:
    Out.shape = (3, 5)
)DOC");
  }
};

class SqueezeGradInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *context) const override {
    context->SetOutputDim(framework::GradVarName("X"),
                          context->GetInputDim("X"));
    context->ShareLoD("X", framework::GradVarName("X"));
  }
};

class SqueezeGradOp : public framework::OperatorBase {
 public:
  using OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
    auto dx_name = Output(framework::GradVarName("X"));
    auto dout_name = Input(framework::GradVarName("Out"));
    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
    framework::AttributeMap attrs;
    attrs["shape"] = framework::vectorize2int(x_dims);
    attrs["inplace"] = Attr<bool>("inplace");

    auto reshape_op = framework::OpRegistry::CreateOp(
        "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
        attrs);
    reshape_op->Run(scope, place);
  }
};

}  // namespace operators
}  // namespace paddle

// Tell the linker to use the reshape op.
USE_OP(reshape);

namespace ops = paddle::operators;
REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
                  ops::SqueezeOpInferShape,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape);
@@ -0,0 +1,191 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

class UnsqueezeOpInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of UnsqueezeOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of UnsqueezeOp should not be null.");

    const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
    const auto &x_dims = ctx->GetInputDim("X");
    // Validity check: input tensor rank (<= 6).
    PADDLE_ENFORCE(x_dims.size() <= 6,
                   "Invalid dimensions, the rank of Input(X) "
                   "should be in the range of [1, 6] (Eigen limit).");
    auto out_dims = GetOutputShape(axes, x_dims);
    ctx->SetOutputDim("Out", out_dims);
    if (x_dims[0] == out_dims[0]) {
      // Only pass LoD when the first dimension of output and Input(X)
      // are the same.
      ctx->ShareLoD("X", "Out");
    }
  }

  static framework::DDim GetOutputShape(const std::vector<int> unsqz_dims,
                                        const framework::DDim &in_dims) {
    int output_size = in_dims.size() + static_cast<int>(unsqz_dims.size());
    int cur_output_size = in_dims.size();
    std::vector<int64_t> output_shape(output_size, 0);

    // Validity check: rank range.
    PADDLE_ENFORCE(output_size <= 6,
                   "The output tensor's rank should be less than or equal "
                   "to 6.");

    for (int axis : unsqz_dims) {
      int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
      // Validity check: the axis bound.
      PADDLE_ENFORCE(
          cur >= 0 && cur <= cur_output_size,
          "The unsqueeze dims must be within the range of the current rank.");
      // Move the old axes and insert the new axis.
      for (int i = cur_output_size; i >= cur; --i) {
        if (output_shape[i] == 1) {
          // Move axis
          output_shape[i + 1] = 1;
          output_shape[i] = 0;
        }
      }
      output_shape[cur] = 1;
      // Add the output size.
      cur_output_size++;
    }

    // Make the output shape.
    for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) {
      if (output_shape[out_idx] == 0) {
        output_shape[out_idx] = in_dims[in_idx++];
      }
    }

    return framework::make_ddim(output_shape);
  }
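  // For the DOC example below (in_dims = [3, 4, 5], axes = [0, 4]), the loop
  // above first marks output_shape as [1, 0, 0, 0, 1] and the final pass
  // fills the zeros from in_dims, giving [1, 3, 4, 5, 1].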
};

class UnsqueezeOp : public framework::OperatorBase {
 public:
  using OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
    auto &axes = Attr<std::vector<int>>("axes");
    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
    auto out_dims = UnsqueezeOpInferShape::GetOutputShape(axes, x_dims);

    framework::AttributeMap attrs;
    attrs["shape"] = framework::vectorize2int(out_dims);
    attrs["inplace"] = Attr<bool>("inplace");
    // Invoke the Reshape op.
    auto reshape_op = framework::OpRegistry::CreateOp(
        "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
        {{"Out", {Output("Out")}}}, attrs);
    reshape_op->Run(scope, place);
  }
};

class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X", "(Tensor). The input tensor of the unsqueeze operator.");
    AddOutput("Out", "(Tensor). The output tensor of the unsqueeze operator.");
    AddAttr<std::vector<int>>("axes",
                              "(std::vector<int>). List of integers "
                              "indicating the dimensions to be inserted.")
        .AddCustomChecker([](const std::vector<int> &axes) {
          PADDLE_ENFORCE(!axes.empty(),
                         "Invalid axes, the unsqueeze axes are empty.");
          // Validity check: number of axes (< 6).
          PADDLE_ENFORCE(static_cast<int>(axes.size()) < 6,
                         "Invalid dimensions, dynamic dimensions should be "
                         "within [1, 6] dimensions (Eigen limit).");
          // Validity check: the range of the unsqueeze axes.
          for (int axis : axes) {
            PADDLE_ENFORCE(axis < 6,
                           "Invalid dimensions, input axis should be "
                           "within [1, 6] dimensions (Eigen limit).");
          }
        });
    AddAttr<bool>(
        "inplace",
        "(default: false) Unsqueeze the source tensor's shape without "
        "memory copy. When Attr(inplace) is set true, the output "
        "tensor shares memory with Input(X); otherwise, a new output "
        "tensor is created, and its data are copied from Input(X).")
        .SetDefault(false);
    AddComment(R"DOC(
Unsqueeze Operator.

Insert single-dimensional entries into the shape of a tensor.
Takes one required argument axes, a list of dimensions that will be inserted.
Dimension indices in axes are as seen in the output tensor.

For example:
  Given a tensor with shape [3, 4, 5],
  then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1].
)DOC");
  }
};

class UnsqueezeGradInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
    ctx->ShareLoD("X", framework::GradVarName("X"));
  }
};

class UnsqueezeGradOp : public framework::OperatorBase {
 public:
  using OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
    auto dx_name = Output(framework::GradVarName("X"));
    auto dout_name = Input(framework::GradVarName("Out"));
    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();

    framework::AttributeMap attrs;
    attrs["shape"] = framework::vectorize2int(x_dims);
    attrs["inplace"] = Attr<bool>("inplace");

    auto reshape_op = framework::OpRegistry::CreateOp(
        "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
        attrs);
    reshape_op->Run(scope, place);
  }
};

}  // namespace operators
}  // namespace paddle

// Tell the linker to use the reshape op.
USE_OP(reshape);

namespace ops = paddle::operators;
REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
                  ops::UnsqueezeOpInferShape,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
                  ops::UnsqueezeGradInferShape);