Merge branch 'develop' into warpctc_deps

7 years ago · 3ed8528a40
parent ca636eedb9 d8b923ab04
commit 3ed8528a40
40 changed files with 1609 additions and 154 deletions
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@ -18,6 +18,11 @@ dynamic_lstm
 ..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
    :noindex:
 dynamic_gru
 -----------
 ..  autofunction:: paddle.v2.fluid.layers.dynamic_gru
    :noindex:
 data
 ----
 ..  autofunction:: paddle.v2.fluid.layers.data
@ -500,6 +505,11 @@ swish
 ..  autofunction:: paddle.v2.fluid.layers.swish
    :noindex:
 im2sequence
 -----------
 ..  autofunction:: paddle.v2.fluid.layers.im2sequence
    :noindex:
 edit_distance
 ---------------
 ..  autofunction:: paddle.v2.fluid.layers.edit_distance_error
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@ -25,14 +25,14 @@
  .. code-block:: bash
-     docker pull docker.paddlepaddle.org/paddle
+     docker pull docker.paddlepaddlehub.com/paddle
 下载GPU版本（cuda8.0_cudnn5_avx_mkl）的Docker镜像：
  .. code-block:: bash
     docker pull paddlepaddle/paddle:latest-gpu
-     docker pull docker.paddlepaddle.org/paddle:latest-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
 选择下载使用不同的BLAS库的Docker镜像：
@ -49,7 +49,7 @@
     docker pull paddlepaddle/paddle:[tag]
     # 比如：
-     docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
 .. _docker_run:
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@ -26,14 +26,14 @@ For users in China, we provide a faster mirror:
  .. code-block:: bash
-     docker pull docker.paddlepaddle.org/paddle
+     docker pull docker.paddlepaddlehub.com/paddle
 Download GPU version (cuda8.0_cudnn5_avx_mkl) images:
  .. code-block:: bash
     docker pull paddlepaddle/paddle:latest-gpu
-     docker pull docker.paddlepaddle.org/paddle:latest-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
 Choose between different BLAS version:
@ -53,7 +53,7 @@ and run:
     docker pull paddlepaddle/paddle:[tag]
     # i.e.
-     docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
 .. _docker_run:
--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
@ -61,6 +61,9 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
      }
      return val;
    }
    case proto::AttrType::LONG: {
      return attr_desc.l();
    }
    default:
      PADDLE_THROW("Unsupport attr type %d", attr_desc.type());
  }
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@ -168,6 +168,32 @@ struct ExtractAttribute<bool> {
  const std::string& attr_name_;
 };
 // Specialization that extracts an attribute as int64_t. An attribute that
 // currently holds an int or a float is first converted in place to int64_t,
 // so after a successful call the variant itself stores an int64_t.
 template <>
 struct ExtractAttribute<int64_t> {
  explicit ExtractAttribute(const std::string& attr_name)
      : attr_name_(attr_name) {}

  // Returns a pointer to the int64_t held by `attr`. Raises through
  // PADDLE_THROW when the attribute holds any type other than int, float or
  // int64_t.
  int64_t* operator()(Attribute& attr) const {
    if (attr.type() == typeid(int)) {  // NOLINT
      int val = boost::get<int>(attr);
      attr = static_cast<int64_t>(val);
    } else if (attr.type() == typeid(float)) {  // NOLINT
      // Convert directly to int64_t. Staging the value in an `int` first
      // would mangle floats whose magnitude exceeds the int range.
      float val = boost::get<float>(attr);
      attr = static_cast<int64_t>(val);
    }
    int64_t* attr_value = nullptr;
    try {
      attr_value = &boost::get<int64_t>(attr);
    } catch (boost::bad_get& bad_get) {
      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
                   attr_name_, attr.type().name());
    }
    return attr_value;
  }

  // Name of the attribute, used only in the error message.
  const std::string& attr_name_;
 };
 // check whether a certain attribute fit its limits
 // an attribute can have more than one limits
 template <typename T>
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@ -75,7 +75,7 @@ std::vector<VarDesc *> BlockDesc::AllVars() const {
 OpDesc *BlockDesc::AppendOp() {
  need_update_ = true;
-  ops_.emplace_back(new OpDesc());
+  ops_.emplace_back(new OpDesc(this));
  return ops_.back().get();
 }
@ -86,7 +86,7 @@ void BlockDesc::AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc) {
 OpDesc *BlockDesc::PrependOp() {
  need_update_ = true;
-  ops_.emplace_front(new OpDesc());
+  ops_.emplace_front(new OpDesc(this));
  return ops_.front().get();
 }
@ -153,7 +153,7 @@ BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
    vars_[var_desc.name()].reset(new VarDesc(var_desc));
  }
  for (const proto::OpDesc &op_desc : desc_->ops()) {
-    ops_.emplace_back(new OpDesc(op_desc, prog));
+    ops_.emplace_back(new OpDesc(op_desc, prog, this));
  }
 }
@ -162,7 +162,7 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
    : prog_(prog), desc_(desc) {
  need_update_ = true;
  for (auto &op : other.ops_) {
-    ops_.emplace_back(new OpDesc(*op));
+    ops_.emplace_back(new OpDesc(*op, this));
  }
  for (auto &it : other.vars_) {
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@ -26,6 +26,7 @@ enum AttrType {
  BOOLEAN = 6;
  BOOLEANS = 7;
  BLOCK = 8;
  LONG = 9;
 }
 // OpDesc describes an instance of a C++ framework::OperatorBase
@ -44,6 +45,7 @@ message OpDesc {
    optional bool b = 10;
    repeated bool bools = 11;
    optional int32 block_idx = 12;
    optional int64 l = 13;
  };
  message Var {
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@ -97,7 +97,7 @@ void OpDesc::CopyFrom(const OpDesc &op_desc) {
  need_update_ = true;
 }
-OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
+OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
    : desc_(desc), need_update_(false) {
  // restore inputs_
  int input_size = desc_.inputs_size();
@ -131,6 +131,7 @@ OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
      attrs_[attr_name] = prog->MutableBlock(bid);
    }
  }
  this->block_ = block;
 }
 proto::OpDesc *OpDesc::Proto() {
@ -282,6 +283,7 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
    VectorToRepeated(v, attr_->mutable_bools());
  }
  // Block attribute: records the block's ID as the attribute's block_idx.
  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
  // 64-bit integer attribute: stored through set_l.
  void operator()(int64_t v) const { attr_->set_l(v); }
  // A boost::blank alternative is not expected here and raises an error.
  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
 };
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@ -25,7 +25,6 @@ namespace framework {
 class BlockDesc;
 class ProgramDesc;
 class OpDesc {
 public:
  OpDesc() {}
@ -33,7 +32,14 @@ class OpDesc {
  OpDesc(const std::string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const AttributeMap &attrs);
-  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog);
+  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block);
  explicit OpDesc(BlockDesc *block) : block_(block) {}
  // Copies `other` into this object via assignment, then overrides the
  // copied block pointer with `block`.
  OpDesc(const OpDesc &other, BlockDesc *block) {
    *this = other;
    block_ = block;
  }
  void CopyFrom(const OpDesc &op_desc);
@ -117,6 +123,10 @@ class OpDesc {
  void Flush();
  // Returns the stored block pointer (the pointer is not owned; see block_).
  BlockDesc *Block() { return this->block_; }
  // Replaces the stored block pointer.
  void SetBlock(BlockDesc *block) { this->block_ = block; }
 private:
  template <typename MapType>
  static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
@ -129,6 +139,7 @@ class OpDesc {
  }
  proto::OpDesc desc_;
  BlockDesc *block_;  // not_own
  // input arg name => input variable names
  VariableNameMap inputs_;
  // output arg name => output variable names
--- a/paddle/framework/type_defs.h
+++ b/paddle/framework/type_defs.h
@ -35,7 +35,7 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 using Attribute =
    boost::variant<boost::blank, int, float, std::string, std::vector<int>,
                   std::vector<float>, std::vector<std::string>, bool,
-                   std::vector<bool>, BlockDesc*>;
+                   std::vector<bool>, BlockDesc*, int64_t>;
 using AttributeMap = std::unordered_map<std::string, Attribute>;
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
@ -66,6 +66,8 @@ class VarDesc {
  // Returns a copy of the name held by desc_.
  std::string Name() const { return desc_.name(); }
  // Overwrites the name held by desc_.
  void SetName(std::string name) { desc_.set_name(name); }
  void SetShape(const std::vector<int64_t> &dims);
  void SetDataType(proto::DataType data_type);
--- a/paddle/framework/variable_test.cc
+++ b/paddle/framework/variable_test.cc
@ -12,19 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 /*
  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
  http://www.apache.org/licenses/LICENSE-2.0
  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
 */
 #include <memory>
 #include <string>
--- a/paddle/operators/bipartite_match_op.cc
+++ b/paddle/operators/bipartite_match_op.cc
@ -21,8 +21,6 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 constexpr char kEPS = 1e-6;
 class BipartiteMatchOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@ -46,6 +44,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
  // The match_dist must be initialized to 0 at first.
  void BipartiteMatch(const Tensor& dist, int* match_indices,
                      T* match_dist) const {
    constexpr T kEPS = static_cast<T>(1e-6);
    PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2.");
    int64_t row = dist.dims()[0];
    int64_t col = dist.dims()[1];
--- a/paddle/operators/iou_similarity_op.cc
+++ b/paddle/operators/iou_similarity_op.cc
@ -0,0 +1,96 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/operators/iou_similarity_op.h"
 namespace paddle {
 namespace operators {
 // Operator definition for iou_similarity. InferShape requires inputs X and Y
 // to be present, to have rank 2 and a second dimension of 4, and sets Out to
 // [rows of X, rows of Y] while sharing X's LoD with Out.
 class IOUSimilarityOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of IOUSimilarityOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Y"),
                   "Input(Y) of IOUSimilarityOp should not be null.");

    auto box_x_shape = ctx->GetInputDim("X");
    auto box_y_shape = ctx->GetInputDim("Y");

    // Each input must be a 2-D list of 4-value boxes.
    PADDLE_ENFORCE_EQ(box_x_shape.size(), 2UL,
                      "The rank of Input(X) must be 2.");
    PADDLE_ENFORCE_EQ(box_x_shape[1], 4UL, "The shape of X is [N, 4]");
    PADDLE_ENFORCE_EQ(box_y_shape.size(), 2UL,
                      "The rank of Input(Y) must be 2.");
    PADDLE_ENFORCE_EQ(box_y_shape[1], 4UL, "The shape of Y is [M, 4]");

    ctx->ShareLoD("X", /*->*/ "Out");
    auto out_shape = framework::make_ddim({box_x_shape[0], box_y_shape[0]});
    ctx->SetOutputDim("Out", out_shape);
  }
 };
 // Declares the inputs, output and documentation of the iou_similarity
 // operator: X is an [N, 4] box list, Y is an [M, 4] box list, and Out holds
 // the [N, M] pairwise IOU scores.
 class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  IOUSimilarityOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(LoDTensor, default LoDTensor<float>) "
             "Box list X is a 2-D LoDTensor with shape [N, 4] holds N boxes, "
             "each box is represented as [xmin, ymin, xmax, ymax], "
             "the shape of X is [N, 4]. [xmin, ymin] is the left top "
             "coordinate of the box if the input is image feature map, they "
             "are close to the origin of the coordinate system. "
             "[xmax, ymax] is the right bottom coordinate of the box. "
             "This tensor can contain LoD information to represent a batch "
             "of inputs. One instance of this batch can contain different "
             "numbers of entities.");
    // The description of Y states Y's own shape, [M, 4], which is what
    // InferShape enforces and what the [N, M] output shape relies on.
    AddInput("Y",
             "(Tensor, default Tensor<float>) "
             "Box list Y holds M boxes, each box is represented as "
             "[xmin, ymin, xmax, ymax], the shape of Y is [M, 4]. "
             "[xmin, ymin] is the left top coordinate of the box if the "
             "input is image feature map, and [xmax, ymax] is the right "
             "bottom coordinate of the box.");
    AddOutput("Out",
              "(LoDTensor, the lod is same as input X) The output of "
              "iou_similarity op, a tensor with shape [N, M] "
              "representing pairwise iou scores.");
    AddComment(R"DOC(
 IOU Similarity Operator.
 Computes intersection-over-union (IOU) between two box lists.
 Box list 'X' should be a LoDTensor and 'Y' is a common Tensor,
 boxes in 'Y' are shared by all instance of the batched inputs of X.
 Given two boxes A and B, the calculation of IOU is as follows:
 $$
 IOU(A, B) = 
 \frac{area(A\cap B)}{area(A)+area(B)-area(A\cap B)}
 $$
 )DOC");
  }
 };
 }  // namespace operators
 }  // namespace paddle
 // Short alias used by the registration macros below.
 namespace ops = paddle::operators;
 // Register iou_similarity as an operator without a gradient, described by
 // IOUSimilarityOp and IOUSimilarityOpMaker.
 REGISTER_OP_WITHOUT_GRADIENT(iou_similarity, ops::IOUSimilarityOp,
                             ops::IOUSimilarityOpMaker);
 // Register the CPU kernel instantiations for float and double.
 REGISTER_OP_CPU_KERNEL(
    iou_similarity,
    ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, float>,
    ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/operators/iou_similarity_op.cu
+++ b/paddle/operators/iou_similarity_op.cu
@ -0,0 +1,21 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/operators/iou_similarity_op.h"
 // Short alias used by the registration macro below.
 namespace ops = paddle::operators;
 // Register the CUDA kernel instantiations of iou_similarity for float and
 // double.
 REGISTER_OP_CUDA_KERNEL(
    iou_similarity,
    ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, float>,
    ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/operators/iou_similarity_op.h
+++ b/paddle/operators/iou_similarity_op.h
@ -0,0 +1,90 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include "paddle/framework/op_registry.h"
 #include "paddle/platform/for_range.h"
 // Computes intersection-over-union for two boxes, each given as
 // (xmin, ymin, xmax, ymax). Negative overlap extents are clamped to zero.
 // The final division is unguarded: a zero union area yields whatever T's
 // division produces.
 template <typename T>
 inline HOSTDEVICE T IOUSimilarity(T xmin1, T ymin1, T xmax1, T ymax1, T xmin2,
                                  T ymin2, T xmax2, T ymax2) {
  constexpr T zero = static_cast<T>(0);

  // Areas of the two boxes on their own.
  T first_area = (ymax1 - ymin1) * (xmax1 - xmin1);
  T second_area = (ymax2 - ymin2) * (xmax2 - xmin2);

  // Corners of the overlap rectangle: innermost max and outermost min.
  T overlap_right = xmax1 > xmax2 ? xmax2 : xmax1;
  T overlap_bottom = ymax1 > ymax2 ? ymax2 : ymax1;
  T overlap_left = xmin1 > xmin2 ? xmin1 : xmin2;
  T overlap_top = ymin1 > ymin2 ? ymin1 : ymin2;

  // Extents of the overlap; anything not strictly positive becomes zero.
  T overlap_h = overlap_bottom - overlap_top;
  T overlap_w = overlap_right - overlap_left;
  if (!(overlap_h > zero)) {
    overlap_h = zero;
  }
  if (!(overlap_w > zero)) {
    overlap_w = zero;
  }

  T overlap_area = overlap_w * overlap_h;
  T combined_area = first_area + second_area - overlap_area;
  return overlap_area / combined_area;
 }
 // Per-row functor: for row `row_id` of x_, computes the IOU against each of
 // the cols_ boxes in y_ and writes the scores into row `row_id` of z_.
 // Boxes are read as four consecutive values (xmin, ymin, xmax, ymax).
 template <typename T>
 struct IOUSimilarityFunctor {
  IOUSimilarityFunctor(const T* x, const T* y, T* z, int cols)
      : x_(x), y_(y), z_(z), cols_(static_cast<size_t>(cols)) {}

  inline HOSTDEVICE void operator()(size_t row_id) const {
    // The box from x_ is fixed for the whole row; load it once.
    const T* lhs = x_ + row_id * 4;
    T lhs_xmin = lhs[0];
    T lhs_ymin = lhs[1];
    T lhs_xmax = lhs[2];
    T lhs_ymax = lhs[3];

    T* out_row = z_ + row_id * cols_;
    for (size_t col = 0; col < cols_; ++col) {
      const T* rhs = y_ + col * 4;
      T rhs_xmin = rhs[0];
      T rhs_ymin = rhs[1];
      T rhs_xmax = rhs[2];
      T rhs_ymax = rhs[3];
      out_row[col] = IOUSimilarity(lhs_xmin, lhs_ymin, lhs_xmax, lhs_ymax,
                                   rhs_xmin, rhs_ymin, rhs_xmax, rhs_ymax);
    }
  }

  const T* x_;
  const T* y_;
  T* z_;
  const size_t cols_;
 };
 namespace paddle {
 namespace operators {
 // Kernel for iou_similarity: builds an IOUSimilarityFunctor over the data of
 // X, Y and Out and invokes it through platform::ForRange, constructed with
 // the number of rows of X.
 template <typename DeviceContext, typename T>
 class IOUSimilarityKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const framework::LoDTensor* boxes_x = ctx.Input<framework::LoDTensor>("X");
    const framework::Tensor* boxes_y = ctx.Input<framework::Tensor>("Y");
    framework::LoDTensor* scores = ctx.Output<framework::LoDTensor>("Out");

    // Row counts of the two box lists (first dimension of each input).
    int num_x = boxes_x->dims()[0];
    int num_y = boxes_y->dims()[0];

    IOUSimilarityFunctor<T> functor(boxes_x->data<T>(), boxes_y->data<T>(),
                                    scores->mutable_data<T>(ctx.GetPlace()),
                                    num_y);

    const auto& dev_ctx =
        static_cast<const DeviceContext&>(ctx.device_context());
    platform::ForRange<DeviceContext> for_range(dev_ctx, num_x);
    for_range(functor);
  }
 };
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@ -66,6 +66,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(boolean, default false) "
                  "Sparse update")
        .SetDefault(false);
    AddAttr<int64_t>("padding_idx",
                     "(int64, default -1) "
                     "If the value is -1, it makes no effect to lookup. "
                     "Otherwise the given value indicates padding the output "
                     "with zeros whenever lookup encounters it in Ids.")
        .SetDefault(-1);
    AddComment(R"DOC(
 Lookup Table Operator.
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@ -21,9 +21,11 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
+template <typename T, int BlockDimX, int BlockDimY, int GridDimX,
          bool PaddingFlag>
 __global__ void LookupTable(T* output, const T* table, const int64_t* ids,
-                            const int64_t N, const int64_t K, const int64_t D) {
+                            const int64_t N, const int64_t K, const int64_t D,
                            const int64_t padding_idx) {
  int idx = threadIdx.x;
  int idy = blockIdx.x + threadIdx.y * GridDimX;
@ -34,7 +36,14 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids,
    T* out = output + idy * D;
    const T* tab = table + id * D;
    for (int i = idx; i < D; i += BlockDimX) {
-      out[i] = tab[i];
+      if (PaddingFlag) {
        if (id == padding_idx)
          out[i] = static_cast<T>(0);
        else
          out[i] = tab[i];
      } else {
        out[i] = tab[i];
      }
    }
    idy += BlockDimY * GridDimX;
  }
@ -67,6 +76,7 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
    auto* table_t = context.Input<LoDTensor>("W");
    auto* ids_t = context.Input<LoDTensor>("Ids");
    auto* output_t = context.Output<LoDTensor>("Out");
    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
    size_t N = table_t->dims()[0];
    size_t D = table_t->dims()[1];
@ -77,10 +87,17 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
    dim3 threads(128, 8);
    dim3 grids(8, 1);
-    LookupTable<
+
-        T, 128, 8,
+    if (padding_idx == -1)
-        8><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+      LookupTable<
-        output, table, ids, N, K, D);
+          T, 128, 8, 8,
          false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
          output, table, ids, N, K, D, padding_idx);
    else
      LookupTable<
          T, 128, 8, 8,
          true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
          output, table, ids, N, K, D, padding_idx);
  }
 };
@ -91,6 +108,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
    auto& dev_ctx =
        context.template device_context<platform::CUDADeviceContext>();
    bool is_sparse = context.Attr<bool>("is_sparse");
    // Since paddings are not trainable and fixed in forward, the gradient of
    // paddings makes no sense and we don't deal with it in backward.
    if (is_sparse) {
      auto* ids = context.Input<LoDTensor>("Ids");
      auto* table = context.Input<LoDTensor>("W");
--- a/paddle/operators/lookup_table_op.h
+++ b/paddle/operators/lookup_table_op.h
@ -32,16 +32,30 @@ class LookupTableKernel : public framework::OpKernel<T> {
    auto* table_t = context.Input<LoDTensor>("W");      // float tensor
    auto* ids_t = context.Input<LoDTensor>("Ids");      // int tensor
    auto* output_t = context.Output<LoDTensor>("Out");  // float tensor
    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
    int N = table_t->dims()[0];
    int D = table_t->dims()[1];
    auto* ids = ids_t->data<int64_t>();
    auto* table = table_t->data<T>();
    auto* output = output_t->mutable_data<T>(context.GetPlace());
-    for (int64_t i = 0; i < ids_t->numel(); ++i) {
+
-      PADDLE_ENFORCE_LT(ids[i], N);
+    if (padding_idx == -1) {
-      PADDLE_ENFORCE_GE(ids[i], 0);
+      for (int64_t i = 0; i < ids_t->numel(); ++i) {
-      memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+        PADDLE_ENFORCE_LT(ids[i], N);
        PADDLE_ENFORCE_GE(ids[i], 0);
        memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
      }
    } else {
      for (int64_t i = 0; i < ids_t->numel(); ++i) {
        if (ids[i] == padding_idx) {
          memset(output + i * D, 0, D * sizeof(T));
        } else {
          PADDLE_ENFORCE_LT(ids[i], N);
          PADDLE_ENFORCE_GE(ids[i], 0);
          memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
        }
      }
    }
  }
 };
@ -51,6 +65,8 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    bool is_sparse = context.Attr<bool>("is_sparse");
    // Since paddings are not trainable and fixed in forward, the gradient of
    // paddings makes no sense and we don't deal with it in backward.
    if (is_sparse) {
      auto* ids = context.Input<LoDTensor>("Ids");
      auto* table = context.Input<LoDTensor>("W");
--- a/paddle/operators/nce_op.cc
+++ b/paddle/operators/nce_op.cc
@ -124,7 +124,8 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
                              "This attribute only be used in unitest. Classes "
                              "in this list wiil be used as negative classes "
                              "for every samples. Under normal conditions, "
-                              "user should avoid setting this attribute.");
+                              "user should avoid setting this attribute.")
        .SetDefault({});
    AddComment(R"DOC(
 Compute and return the noise-contrastive estimation training loss.
 See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
--- a/paddle/operators/nce_op.h
+++ b/paddle/operators/nce_op.h
@ -197,7 +197,8 @@ class NCEGradKernel : public framework::OpKernel<T> {
    // get d_x
    auto d_x = context.Output<Tensor>(framework::GradVarName("Input"));
    if (d_x != nullptr) {
-      d_x->mutable_data<T>(context.GetPlace());
+      auto* d_x_data = d_x->mutable_data<T>(context.GetPlace());
      std::fill(d_x_data, d_x_data + d_x->numel(), 0.0);
      auto d_x_matrix = EigenMatrix<T>::From(*d_x);
      auto w_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
--- a/paddle/operators/prior_box_op.cc
+++ b/paddle/operators/prior_box_op.cc
@ -0,0 +1,154 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/operators/prior_box_op.h"
 namespace paddle {
 namespace operators {
 // Operator definition for prior_box. InferShape validates the inputs and
 // attributes and sets both outputs (Boxes and Variances) to the shape
 // [input_dims[2], input_dims[3], num_priors, 4].
 class PriorBoxOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Input"),
                   "Input(Input) of PriorBoxOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Image"),
                   "Input(Image) of PriorBoxOp should not be null.");
    auto image_dims = ctx->GetInputDim("Image");
    auto input_dims = ctx->GetInputDim("Input");
    PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW.");
    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
    PADDLE_ENFORCE_LT(input_dims[2], image_dims[2],
                      "The height of input must smaller than image.");
    PADDLE_ENFORCE_LT(input_dims[3], image_dims[3],
                      "The width of input must smaller than image.");
    auto min_sizes = ctx->Attrs().Get<std::vector<int>>("min_sizes");
    auto max_sizes = ctx->Attrs().Get<std::vector<int>>("max_sizes");
    auto variances = ctx->Attrs().Get<std::vector<float>>("variances");
    auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
    bool flip = ctx->Attrs().Get<bool>("flip");
    PADDLE_ENFORCE_GT(min_sizes.size(), 0,
                      "Size of min_sizes must be at least 1.");
    for (size_t i = 0; i < min_sizes.size(); ++i) {
      PADDLE_ENFORCE_GT(min_sizes[i], 0, "min_sizes[%d] must be positive.", i);
    }
    std::vector<float> aspect_ratios_vec;
    ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
    // One prior per (expanded aspect ratio, min size) pair ...
    int num_priors = aspect_ratios_vec.size() * min_sizes.size();
    if (max_sizes.size() > 0) {
      PADDLE_ENFORCE_EQ(max_sizes.size(), min_sizes.size(),
                        "The number of min_size and max_size must be equal.");
      for (size_t i = 0; i < min_sizes.size(); ++i) {
        PADDLE_ENFORCE_GT(max_sizes[i], min_sizes[i],
                          "max_size[%d] must be greater than min_size[%d].", i,
                          i);
        // ... plus one additional prior per max size.
        num_priors += 1;
      }
    }
    PADDLE_ENFORCE_EQ(variances.size(), 4, "Must and only provide 4 variance.");
    for (size_t i = 0; i < variances.size(); ++i) {
      PADDLE_ENFORCE_GT(variances[i], 0.0,
                        "variance[%d] must be greater than 0.", i);
    }
    // A step of 0 is the attribute's default and is documented as "auto
    // calculation", so only negative steps are rejected.
    const float step_h = ctx->Attrs().Get<float>("step_h");
    PADDLE_ENFORCE_GE(step_h, 0.0, "step_h should not be negative.");
    const float step_w = ctx->Attrs().Get<float>("step_w");
    PADDLE_ENFORCE_GE(step_w, 0.0, "step_w should not be negative.");
    std::vector<int64_t> dim_vec(4);
    dim_vec[0] = input_dims[2];
    dim_vec[1] = input_dims[3];
    dim_vec[2] = num_priors;
    dim_vec[3] = 4;
    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
    ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
  }
 };
 // Declares the inputs, outputs, attributes and documentation of the
 // prior_box operator.
 class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  PriorBoxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Input",
             "(Tensor, default Tensor<float>), "
             "the input feature data of PriorBoxOp, The layout is NCHW.");
    AddInput("Image",
             "(Tensor, default Tensor<float>), "
             "the input image data of PriorBoxOp, The layout is NCHW.");
    AddOutput("Boxes",
              "(Tensor, default Tensor<float>), the output prior boxes of "
              "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
              "H is the height of input, W is the width of input, num_priors "
              "is the box count of each position.");
    AddOutput("Variances",
              "(Tensor, default Tensor<float>), the expanded variances of "
              "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
              "H is the height of input, W is the width of input, num_priors "
              "is the box count of each position.");
    // The "(type) " prefix and the description are adjacent string literals
    // forming ONE comment argument, as in the other AddAttr calls in this
    // file. A comma between them would pass the description as a separate
    // third argument instead of as part of the comment.
    AddAttr<std::vector<int>>("min_sizes",
                              "(vector<int>) "
                              "List of min sizes of generated prior boxes.");
    AddAttr<std::vector<int>>("max_sizes",
                              "(vector<int>) "
                              "List of max sizes of generated prior boxes.");
    AddAttr<std::vector<float>>(
        "aspect_ratios",
        "(vector<float>) "
        "List of aspect ratios of generated prior boxes.");
    AddAttr<std::vector<float>>(
        "variances",
        "(vector<float>) "
        "List of variances to be encoded in prior boxes.");
    AddAttr<bool>("flip",
                  "(bool) "
                  "Whether to flip aspect ratios.")
        .SetDefault(true);
    AddAttr<bool>("clip",
                  "(bool) "
                  "Whether to clip out-of-boundary boxes.")
        .SetDefault(true);
    AddAttr<float>("step_w",
                   "Prior boxes step across width, 0 for auto calculation.")
        .SetDefault(0.0);
    AddAttr<float>("step_h",
                   "Prior boxes step across height, 0 for auto calculation.")
        .SetDefault(0.0);
    AddAttr<float>("offset",
                   "(float) "
                   "Prior boxes center offset.")
        .SetDefault(0.5);
    AddComment(R"DOC(
 Prior box operator
 Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
 Each position of the input produce N prior boxes, N is determined by
 the count of min_sizes, max_sizes and aspect_ratios, The size of the
 box is in range(min_size, max_size) interval, which is generated in
 sequence according to the aspect_ratios.
 Please get more information from the following papers:
 https://arxiv.org/abs/1512.02325.
 )DOC");
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 // Register prior_box as an operator without a gradient, described by
 // PriorBoxOp and PriorBoxOpMaker.
 REGISTER_OP_WITHOUT_GRADIENT(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker);
 // Register the CPU kernel instantiations for float and double.
 REGISTER_OP_CPU_KERNEL(
    prior_box, ops::PriorBoxOpKernel<paddle::platform::CPUPlace, float>,
    ops::PriorBoxOpKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/prior_box_op.h
+++ b/paddle/operators/prior_box_op.h
@ -0,0 +1,188 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/math_function.h"
 #include "paddle/platform/transform.h"
 namespace paddle {
 namespace operators {
 // Builds the de-duplicated list of aspect ratios used to generate priors.
 //
 // The output always starts with 1. Each input ratio that is not already in
 // the output (within a small tolerance) is appended; when `flip` is true its
 // reciprocal is appended right after it. Any previous content of
 // `output_aspect_ratior` is discarded.
 inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
                                bool flip,
                                std::vector<float>& output_aspect_ratior) {
  constexpr float tolerance = 1e-6;

  output_aspect_ratior.clear();
  output_aspect_ratior.push_back(1.);

  for (size_t pos = 0; pos < input_aspect_ratior.size(); ++pos) {
    const float ratio = input_aspect_ratior[pos];

    // Look for a ratio we already emitted that is (nearly) equal.
    bool duplicate = false;
    for (const float kept : output_aspect_ratior) {
      if (fabs(ratio - kept) < tolerance) {
        duplicate = true;
        break;
      }
    }
    if (duplicate) {
      continue;
    }

    output_aspect_ratior.push_back(ratio);
    if (flip) {
      // The reciprocal is appended without a duplicate check of its own.
      output_aspect_ratior.push_back(1. / ratio);
    }
  }
 }
 // Element-wise functor that clamps a value to the closed interval [0, 1].
 template <typename T>
 struct ClipFunctor {
  HOSTDEVICE T operator()(T in) const {
    // Raise to the lower bound first, then cap at the upper bound.
    const T not_below_zero = std::max<T>(in, 0.);
    return std::min<T>(not_below_zero, 1.);
  }
 };
 // Kernel that fills the "Boxes" and "Variances" outputs of the prior_box
 // operator.
 //
 // For every (h, w) position of "Input" it writes a fixed sequence of boxes
 // into "Boxes", which is accessed as a rank-4 tensor indexed by
 // (h, w, prior index, coordinate) with coordinates ordered
 // xmin, ymin, xmax, ymax and divided by the image width/height read from
 // "Image". "Variances" is filled by repeating the `variances` attribute once
 // per generated box.
 template <typename Place, typename T>
 class PriorBoxOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");

    auto min_sizes = ctx.Attr<std::vector<int>>("min_sizes");
    auto max_sizes = ctx.Attr<std::vector<int>>("max_sizes");
    auto input_aspect_ratio = ctx.Attr<std::vector<float>>("aspect_ratios");
    auto variances = ctx.Attr<std::vector<float>>("variances");
    auto flip = ctx.Attr<bool>("flip");
    auto clip = ctx.Attr<bool>("clip");

    // Always contains 1 first, then the de-duplicated (optionally flipped)
    // user ratios.
    std::vector<float> aspect_ratios;
    ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);

    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
    T offset = static_cast<T>(ctx.Attr<float>("offset"));

    // Dimension 3 is used as width and dimension 2 as height for both
    // tensors.
    auto img_width = image->dims()[3];
    auto img_height = image->dims()[2];

    auto feature_width = input->dims()[3];
    auto feature_height = input->dims()[2];

    // If either step attribute is 0, both steps are derived from the
    // image / feature-map size ratio.
    T step_width, step_height;
    if (step_w == 0 || step_h == 0) {
      step_width = static_cast<T>(img_width) / feature_width;
      step_height = static_cast<T>(img_height) / feature_height;
    } else {
      step_width = step_w;
      step_height = step_h;
    }

    int num_priors = aspect_ratios.size() * min_sizes.size();
    if (max_sizes.size() > 0) {
      num_priors += max_sizes.size();
    }

    boxes->mutable_data<T>(ctx.GetPlace());
    vars->mutable_data<T>(ctx.GetPlace());

    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);

    // Writes one box centered at (center_x, center_y) with the given size as
    // normalized (xmin, ymin, xmax, ymax) into slot `idx` of position (h, w).
    auto write_box = [&e_boxes, img_width, img_height](
                         int h, int w, int idx, T center_x, T center_y,
                         T box_width, T box_height) {
      // xmin
      e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
      // ymin
      e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
      // xmax
      e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
      // ymax
      e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
    };

    for (int h = 0; h < feature_height; ++h) {
      for (int w = 0; w < feature_width; ++w) {
        T center_x = (w + offset) * step_width;
        T center_y = (h + offset) * step_height;
        int idx = 0;
        for (size_t s = 0; s < min_sizes.size(); ++s) {
          int min_size = min_sizes[s];
          // first prior: aspect_ratio = 1, size = min_size
          write_box(h, w, idx, center_x, center_y, static_cast<T>(min_size),
                    static_cast<T>(min_size));
          idx++;

          if (max_sizes.size() > 0) {
            // NOTE(review): max_sizes[s] assumes max_sizes has at least
            // min_sizes.size() entries -- presumably enforced by the op's
            // shape inference; confirm.
            int max_size = max_sizes[s];
            // second prior: aspect_ratio = 1,
            // size = sqrt(min_size * max_size)
            // The product is formed in double so that large sizes cannot
            // overflow int.
            T side = static_cast<T>(
                sqrt(static_cast<double>(min_size) * max_size));
            write_box(h, w, idx, center_x, center_y, side, side);
            idx++;
          }

          // rest of priors
          for (size_t r = 0; r < aspect_ratios.size(); ++r) {
            float ar = aspect_ratios[r];
            // Ratio 1 was already emitted as the first prior.
            if (fabs(ar - 1.) < 1e-6) {
              continue;
            }
            write_box(h, w, idx, center_x, center_y,
                      static_cast<T>(min_size * sqrt(ar)),
                      static_cast<T>(min_size / sqrt(ar)));
            idx++;
          }
        }
      }
    }

    if (clip) {
      // Clamp every coordinate of every box to [0, 1] in place.
      platform::Transform<platform::CPUDeviceContext> trans;
      ClipFunctor<T> clip_func;
      trans(ctx.template device_context<platform::CPUDeviceContext>(),
            boxes->data<T>(), boxes->data<T>() + boxes->numel(),
            boxes->data<T>(), clip_func);
    }

    // Stage the variances in a 1 x V tensor so they can be broadcast.
    framework::Tensor var_t;
    var_t.mutable_data<T>(
        framework::make_ddim({1, static_cast<int>(variances.size())}),
        ctx.GetPlace());

    auto var_et = framework::EigenTensor<T, 2>::From(var_t);
    for (size_t i = 0; i < variances.size(); ++i) {
      var_et(0, i) = variances[i];
    }

    // Temporarily view "Variances" as a (box_num x V) matrix, fill it by
    // repeating the staged row once per box, then restore its original dims.
    int box_num = feature_height * feature_width * num_priors;
    auto var_dim = vars->dims();
    vars->Resize({box_num, static_cast<int>(variances.size())});

    auto e_vars = framework::EigenMatrix<T, Eigen::RowMajor>::From(*vars);
    e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(box_num, 1));

    vars->Resize(var_dim);
  }
 };  // class PriorBoxOpKernel
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/pybind/print_operators_doc.cc
+++ b/paddle/pybind/print_operators_doc.cc
@ -64,6 +64,8 @@ std::string AttrType(paddle::framework::proto::AttrType at) {
      return "bool array";
    case paddle::framework::proto::BLOCK:
      return "block id";
    case paddle::framework::proto::LONG:
      return "long";
  }
  return "UNKNOWN";  // not possible
 }
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@ -212,6 +212,7 @@ void BindVarDsec(py::module &m) {
             return name;
           },
           py::return_value_policy::reference)
      .def("set_name", &VarDesc::SetName)
      .def("set_shape", &VarDesc::SetShape)
      .def("set_dtype", &VarDesc::SetDataType)
      .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
@ -280,7 +281,8 @@ void BindOpDesc(py::module &m) {
      .def("check_attrs", &OpDesc::CheckAttrs)
      .def("infer_shape", &OpDesc::InferShape)
      .def("infer_var_type", &OpDesc::InferVarType)
-      .def("serialize_to_string", SerializeMessage<OpDesc>);
+      .def("serialize_to_string", SerializeMessage<OpDesc>)
      .def("block", &OpDesc::Block, py::return_value_policy::reference);
 }
 }  // namespace pybind
--- a/Show More
+++ b/Show More