Merge remote-tracking branch 'origin/develop' into doc/api1

8 years ago · 4970414b1c
parent 7ad46ec03c 566a940223
commit 4970414b1c
32 changed files with 1058 additions and 446 deletions
--- a/doc/v2/dev/contribute_to_paddle_cn.md
+++ b/doc/v2/dev/contribute_to_paddle_cn.md
@ -104,7 +104,7 @@ no changes added to commit (use "git add" and/or "git commit -a")
 ➜  docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
 ```
-关于构建和测试的更多信息，请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。
+关于构建和测试的更多信息，请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)。
 ## 提交（commit）
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@ -64,7 +64,8 @@ class OpConverter {
    (*it)(op, scope, test_mode);
  }
-  // convert fluid block to tensorrt network
+  // Convert a fluid block to tensorrt network, NOTE it just converts operators,
  // the INetwork's inputs and outputs should be specified in other modules.
  void ConvertBlock(const framework::proto::BlockDesc& block,
                    const std::unordered_set<std::string>& parameters,
                    const framework::Scope& scope, TensorRTEngine* engine) {
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@ -51,11 +51,12 @@ class TensorRTEngine : public EngineBase {
    nvinfer1::Weights w_;
  };
-  TensorRTEngine(int max_batch, int max_workspace, cudaStream_t* stream,
+  TensorRTEngine(int max_batch, int max_workspace,
                 cudaStream_t* stream = nullptr,
                 nvinfer1::ILogger& logger = NaiveLogger::Global())
      : max_batch_(max_batch),
        max_workspace_(max_workspace),
-        stream_(stream),
+        stream_(stream ? stream : &default_stream_),
        logger_(logger) {}
  virtual ~TensorRTEngine();
@ -121,6 +122,8 @@ class TensorRTEngine : public EngineBase {
  // the max memory size the engine uses
  int max_workspace_;
  cudaStream_t* stream_;
  // If stream_ is not set from outside, hold its own stream.
  cudaStream_t default_stream_;
  nvinfer1::ILogger& logger_;
  std::vector<Buffer> buffers_;
@ -165,20 +168,31 @@ class TensorRTEngine : public EngineBase {
 */
 class TRT_EngineManager {
 public:
-  TensorRTEngine* Create(int max_batch, int max_workspace,
+  bool HasEngine(const std::string& name) const {
-                         cudaStream_t* stream) {
+    return engines_.count(name) != 0;
-    engines_.emplace_back(new TensorRTEngine(max_batch, max_workspace, stream));
+  }
-    return engines_.back().get();
+
  // Get an engine called `name`.
  TensorRTEngine* Get(const std::string& name) const {
    return engines_.at(name).get();
  }
  // Create or get an engine called `name`
  TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream,
                         const std::string& name) {
    auto* p = new TensorRTEngine(max_batch, max_workspace, stream);
    engines_[name].reset(p);
    return p;
  }
  void DeleteALl() {
-    for (auto& ptr : engines_) {
+    for (auto& item : engines_) {
-      ptr.reset(nullptr);
+      item.second.reset(nullptr);
    }
  }
 private:
-  std::vector<std::unique_ptr<TensorRTEngine>> engines_;
+  std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
 };
 }  // namespace tensorrt
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@ -112,7 +112,7 @@ $$out = \frac{1}{1 + e^{-x}}$$
 __attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC(
 Logsigmoid Activation Operator
-$$out = \log \frac{1}{1 + e^{-x}}$$
+$$out = \\log \\frac{1}{1 + e^{-x}}$$
 )DOC";
@ -252,15 +252,14 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out", "Output of Softshrink operator");
    AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
    AddComment(R"DOC(
-Softshrink Activation Operator.
+:strong:`Softshrink Activation Operator`
-$$
+..  math::
-out = \begin{cases} 
+    out = \begin{cases} 
-    x - \lambda, \text{if } x > \lambda \\
+         x - \lambda, \text{if } x > \lambda \\
-    x + \lambda, \text{if } x < -\lambda \\
+         x + \lambda, \text{if } x < -\lambda \\
-    0,  \text{otherwise}
+         0,  \text{otherwise}
-    \end{cases}
+         \end{cases}
 $$
 )DOC");
  }
@ -271,18 +270,18 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  void Make() override {
    AddInput("X", "Input of HardShrink operator");
    AddOutput("Out", "Output of HardShrink operator");
-    AddAttr<float>("threshold", "The value of threshold for HardShrink")
+    AddAttr<float>("threshold",
                   "The value of threshold for HardShrink. [default: 0.5]")
        .SetDefault(0.5f);
    AddComment(R"DOC(
-HardShrink Activation Operator.
+:strong:`HardShrink activation operator`
-$$
+..  math::
-out = \begin{cases} 
+    out = \begin{cases}
-    x, \text{if } x > \lambda \\
+            x, \text{if } x > \lambda \\
-    x, \text{if } x < -\lambda \\
+            x, \text{if } x < -\lambda \\
-    0,  \text{otherwise}
+            0,  \text{otherwise}
-    \end{cases}
+          \end{cases}
 $$
 )DOC");
  }
@ -394,18 +393,18 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
  void Make() override {
    AddInput("X", "Input of ThresholdedRelu operator");
    AddOutput("Out", "Output of ThresholdedRelu operator");
-    AddAttr<float>("threshold", "The threshold location of activation")
+    AddAttr<float>("threshold",
                   "The threshold location of activation. [default 1.0].")
        .SetDefault(1.0f);
    AddComment(R"DOC(
-ThresholdedRelu Activation Operator.
+:strong:`ThresholdedRelu activation operator`
-$$
+..  math::
 out = \begin{cases} 
    x, \text{if } x > threshold \\
    0,  \text{otherwise}
    \end{cases}
 $$
    out = \begin{cases}
             x,  \text{if } x > threshold \\
             0,  \text{otherwise}
          \end{cases}
 )DOC");
  }
 };
--- a/paddle/fluid/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
@ -23,30 +23,26 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    OpComment comment;
-    AddInput("X",
+    AddInput("X", string::Sprintf("the left hand operand of %s operator",
-             string::Sprintf("(LoDTensor) the left hand operand of %s operator",
+                                  comment.type));
-                             comment.type));
+    AddInput("Y", string::Sprintf("the right hand operand of %s operator",
-    AddInput("Y", string::Sprintf(
+                                  comment.type));
                      "(LoDTensor) the right hand operand of %s operator",
                      comment.type));
    AddAttr<bool>("force_cpu",
-                  "(bool, default false) Force fill output variable to cpu "
+                  "Force fill output variable to cpu "
                  "memory. Otherwise, fill output variable to the running "
-                  "device")
+                  "device [default true].")
-        .SetDefault(false);
+        .SetDefault(true);
-    AddOutput("Out", string::Sprintf(
+    AddOutput("Out", string::Sprintf("n-dim bool tensor. Each element is %s",
-                         "(LoDTensor) n-dim bool tensor. Each element is %s",
+                                     comment.equation));
-                         comment.equation));
+    AddComment(string::Sprintf(R"DOC(
    AddComment(string::Sprintf(R"DOC(%s Operator
 It operates element-wise on X and Y, and returns the Out. Each of them is a
 N-dim tensor. X and Y could be any type.  The each element of the Out tensor is
-calculated by %s
+calculated by $%s$
 )DOC",
-                               comment.type, comment.equation));
+                               comment.equation));
-    AddAttr<int>("axis",
+    AddAttr<int>(
-                 "(int, default -1). The start dimension index "
+        "axis",
-                 "for broadcasting Y onto X.")
+        "The start dimension index for broadcasting Y onto X. [default -1]")
        .SetDefault(-1)
        .EqualGreaterThan(-1);
  }
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
@ -30,19 +30,19 @@ class CumOp : public framework::OperatorWithKernel {
 class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("X", "Input of Cumsum operator");
+    AddInput("X", "Input of cumsum operator");
-    AddOutput("Out", "Output of Cumsum operator");
+    AddOutput("Out", "Output of cumsum operator");
    AddAttr<int>("axis",
-                 "(int, default -1). The dimenstion to accumulate along. "
+                 "The dimenstion to accumulate along. -1 means the last "
-                 "-1 means the last dimenstion")
+                 "dimenstion [default -1].")
        .SetDefault(-1)
        .EqualGreaterThan(-1);
    AddAttr<bool>("exclusive",
-                  "bool, default false). Whether to perform exclusive cumsum")
+                  "Whether to perform exclusive cumsum. [default false].")
        .SetDefault(false);
    AddAttr<bool>("reverse",
-                  "bool, default false). If true, the cumsum is performed in "
+                  "If true, the cumsum is performed in the reversed direction. "
-                  "the reversed direction")
+                  "[default false].")
        .SetDefault(false);
    AddComment(R"DOC(
 The cumulative sum of the elements along a given axis.
--- a/paddle/fluid/operators/detection/box_coder_op.cc
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
@ -106,23 +106,36 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
              "and M represents the number of deocded boxes.");
    AddComment(R"DOC(
-Bounding Box Coder Operator.
+
 Bounding Box Coder.
 Encode/Decode the target bounding box with the priorbox information.
 The Encoding schema described below:
-ox = (tx - px) / pw / pxv
+
-oy = (ty - py) / ph / pyv
+    ox = (tx - px) / pw / pxv
-ow = log(abs(tw / pw)) / pwv 
+
-oh = log(abs(th / ph)) / phv 
+    oy = (ty - py) / ph / pyv
    ow = log(abs(tw / pw)) / pwv 
    oh = log(abs(th / ph)) / phv 
 The Decoding schema described below:
-ox = (pw * pxv * tx * + px) - tw / 2
+
-oy = (ph * pyv * ty * + py) - th / 2
+    ox = (pw * pxv * tx * + px) - tw / 2
-ow = exp(pwv * tw) * pw + tw / 2
+
-oh = exp(phv * th) * ph + th / 2
+    oy = (ph * pyv * ty * + py) - th / 2
-where tx, ty, tw, th denote the target box's center coordinates, width and
+
-height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
+    ow = exp(pwv * tw) * pw + tw / 2
-center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
+
-of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
+    oh = exp(phv * th) * ph + th / 2
-width and height.
+
 where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
 and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
 priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
 `phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the
 encoded/decoded coordinates, width and height.
 )DOC");
  }
 };
--- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
@ -36,11 +36,12 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
  void Apply() override {
    AddAttr<float>("mean",
                   "(float, default 0.0) "
-                   "mean of random tensor.")
+                   "The mean (or center) of the gaussian distribution.")
        .SetDefault(.0f);
    AddAttr<float>("std",
                   "(float, default 1.0) "
-                   "std of random tensor.")
+                   "The standard deviation (std, or spread) of the "
                   "gaussian distribution.")
        .SetDefault(1.0f);
    AddAttr<int>("seed",
                 "(int, default 0) "
@ -55,9 +56,11 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
        .SetDefault(framework::proto::VarType::FP32);
    AddComment(R"DOC(
 GaussianRandom Operator.
 Used to initialize tensors with gaussian random generator.
 The defalut mean of the distribution is 0. and defalut standard
 deviation (std) of the distribution is 1.. Uers can set mean and std
 by input arguments.
 )DOC");
  }
 };
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@ -62,36 +62,33 @@ class LayerNormOp : public framework::OperatorWithKernel {
 class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("X", "(LoDTensor) The input tensor.");
+    AddInput("X", "The input tensor.");
    AddInput("Scale",
-             "(Tensor, optional) Scale is a 1-dimensional tensor of size "
+             "(optional) Scale is a 1-dimensional tensor of size "
             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
             "It is applied to the output.")
        .AsDispensable();
    AddInput("Bias",
-             "(Tensor, optional) Bias is a 1-dimensional tensor of size "
+             "(optional) Bias is a 1-dimensional tensor of size "
             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
             "It is applied to the output.")
        .AsDispensable();
-    AddOutput("Y", "(LoDTensor) Result after normalization.");
+    AddOutput("Y", "Result after normalization.");
-    AddOutput("Mean", "(Tensor) Mean of the current mini batch.")
+    AddOutput("Mean", "Mean of the current mini batch.").AsIntermediate();
-        .AsIntermediate();
+    AddOutput("Variance", "Variance of the current mini batch.")
    AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
        .AsIntermediate();
    AddAttr<float>("epsilon",
-                   "(float, default 1e-5) Constant for "
+                   "Constant for numerical stability [default 1e-5].")
                   "numerical stability")
        .SetDefault(1e-5)
        .AddCustomChecker([](const float &epsilon) {
          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
                         "'epsilon' should be between 0.0 and 0.001.");
        });
    AddAttr<int>("begin_norm_axis",
-                 "(int default:1), the "
+                 "the axis of `begin_norm_axis ... Rank(X) - 1` will be "
                 "axis of `begin_norm_axis ... Rank(X) - 1` will be "
                 "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
-                 "matrix [N,H].")
+                 "matrix [N,H]. [default 1].")
        .SetDefault(1)
        .AddCustomChecker([](const int &begin_norm_axis) {
          PADDLE_ENFORCE_GT(begin_norm_axis, 0,
@ -99,10 +96,14 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
        });
    AddComment(R"DOC(
-Layer Normalization.
+Assume feature vectors exist on dimensions
-Layer Norm has been implemented as discussed in the paper:
+:attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
-https://arxiv.org/abs/1607.06450
+along these dimensions for each feature vector :math:`a` with size
-...
+:math:`H`, then normalize each feature vector using the corresponding
 statistics. After that, apply learnable gain and bias on the normalized
 tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
 Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
 )DOC");
  }
 };
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@ -348,7 +348,8 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
 };
 void SignalHandler::StopAndExit(int signal_num) {
-  VLOG(3) << "Catch interrupt signal: " << signal_num << ", program will exit";
+  // Do not use VLOG here; the device used for printing may already be released.
  // exit will release internally allocated resources.
  exit(0);
 }
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@ -33,12 +33,10 @@ class MeanOp : public framework::OperatorWithKernel {
 class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("X", "The input of mean op");
+    AddInput("X", "(Tensor) The input of mean op");
-    AddOutput("Out", "The output of mean op").Reuse("X");
+    AddOutput("Out", "(Tensor) The output of mean op").Reuse("X");
    AddComment(R"DOC(
-Mean Operator.
+Mean Operator calculates the mean of all elements in X.
 Out is a scalar which is the mean of all elements in X. 
 )DOC");
  }
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
@ -62,26 +62,46 @@ class MultiplexOp : public framework::OperatorWithKernel {
 class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("Ids", "The index tensor of multiplex operator.");
+    AddInput("Ids",
-    AddInput("X", "The candidate tensors of multiplex operator.")
+             "Tensor<int32>, index variable which is a 2-D tensor with shape "
             "[M, 1] where M is the batch size.");
    AddInput("X",
             "A list of variables to gather from. All variables have the same "
             "shape and the rank is at least 2.")
        .AsDuplicable();
    AddOutput("Out", "The output tensor of multiplex operator.");
    AddComment(R"DOC(
-Multiplex Operator.
+Referring to the given index variable, this layer selects rows from the
-
+input variables to construct a multiplex variable. Assuming that there are
-Multiplex multiple tensors according to the index provided by the index tensor.
+:math:`m` input variables and :math:`I_i` represents the i-th input
-
+variable and :math:`i` is in [0, :math:`m`). All input variables are
-Ids: the index tensor.
+tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`].
-X[0 : N - 1]: the candidate tensors for output (N >= 2).
+Please note that rank of the input tensor should be at least 2. Each input
-For each index i from 0 to batchSize - 1, the output is the i-th row of the
+variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`]
 where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2`
 * ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input
 variable. The given index variable should be a 2-D tensor with shape
 [:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable.
 Then the output variable will be a tensor with shape [:math:`d_0`,
 :math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D
 matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th
 row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
 * Ids: the index tensor.
 * X[0 : N - 1]: the candidate tensors for output (N >= 2).
 * For each index i from 0 to batchSize - 1, the output is the i-th row of the
 the (Ids[i])-th tensor.
 For i-th row of the output tensor:
-$$y[i] = x_{k}[i]$$
+$$
 y[i] = x_{k}[i]
 $$
-where `y` is the output tensor, `x_{k}` is the k-th input tensor,
+where $y$ is the output tensor, $x_{k}$ is the k-th input tensor,
-and `k = Ids[i]`.
+and $k = Ids[i]$.
 )DOC");
  }
--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
@ -78,11 +78,15 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
 class CreateRecordIOReaderOpMaker : public FileReaderMakerBase {
 protected:
  void Apply() override {
-    AddAttr<std::string>("filename", "The filename of record io reader");
+    AddAttr<std::string>(
        "filename",
        "The filename of record file. This file will given to reader.");
    AddComment(R"DOC(
-      CreateRecordIOReader Operator
+Open a recordio file and return the reader object. The returned reader object
 is thread-safe.
-      Create a reader from a record io file
+NOTE: This is a very low-level API. It is used for debugging data file or
 training. Please use `open_files` instead of this API for production usage.
    )DOC");
  }
 };
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@ -54,7 +54,7 @@ std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
 }
 void FileReaderMakerBase::Make() {
-  AddOutput("Out", "(ReaderHolder) The created random reader.").AsDuplicable();
+  AddOutput("Out", "(ReaderHolder): The created random reader.").AsDuplicable();
  AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
  AddAttr<std::vector<int>>(
      "ranks",
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@ -78,23 +78,23 @@ class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
-             "(LoDTensor), the input(X) is a LodTensor, which supports "
+             "the input(X) is a LodTensor, which supports "
             "variable time-length input sequences. The underlying tensor "
             "in this LoDTensor is a matrix with shape (T x N), where T "
             "is the total time steps in this mini-batch and N is the input "
             "data dimension.");
    AddInput("Filter",
-             "(Tensor), the input(Filter) is a learnable parameter. It "
+             "the input(Filter) is a learnable parameter. It "
             "is a 2-D tensor with shape (future_context x N), where, "
             "future_context is the future context length and N is the data "
             "dimension.");
    AddOutput("Out",
-              "(LoDTensor), the output(Out) is a LodTensor, which supports "
+              "the output(Out) is a LodTensor, which supports "
              "variable time-length input sequences. The underlying tensor "
              "in this LodTensor is a matrix with shape T x N, i.e., the "
              "same shape as X.");
    AddComment(R"DOC(
-Row-convolution Operator.
+:strong:`Row-convolution operator`
 The row convolution is called lookahead convolution.  This operator was 
 introduced in the following paper for DeepSpeech2:
@ -114,9 +114,23 @@ and a filter ($W$) of size $context \times d$,
 the output sequence is convolved as:
 $$
-out_{i, :} = \sum_{j=i}^{i + context} in_{j,:} \dot W_{i-j, :}
+out_{i, :} = \\sum_{j=i}^{i + context} in_{j,:} \\cdot W_{i-j, :}
 $$
 In the above equation:
 * $Out_{i}$: The i-th row of output variable with shape [1, D].
 * $\\tau$: Future context size.
 * $X_{j}$: The j-th row of input variable with shape [1, D].
 * $W_{i-j}$: The (i-j)-th row of parameters with shape [1, D].
 More details about row_conv please refer to
 the design document
 https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 .
 )DOC");
  }
 };
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@ -66,17 +66,25 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
 }  // namespace
 template <typename DeviceContext, typename T>
-void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
+void TensorRTEngineKernel<DeviceContext, T>::Prepare(
    const framework::ExecutionContext &context) const {
  VLOG(4) << "Prepare engine";
  // Get the ProgramDesc and pass to convert.
  framework::proto::BlockDesc block_desc;
  block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-  max_batch_ = context.Attr<int>("max_batch");
+  int max_batch = context.Attr<int>("max_batch");
  auto max_workspace = context.Attr<int>("max_workspace");
-  engine_ = Singleton<TRT_EngineManager>::Global().Create(
+  auto params = context.Attr<std::vector<std::string>>("parameters");
-      max_batch_, max_workspace, &stream_);
+  std::unordered_set<std::string> parameters;
-  engine_->InitNetwork();
+  for (const auto &param : params) {
    parameters.insert(param);
  }
  // TODO(Superjomn) replace this with a different stream
  auto *engine = Singleton<TRT_EngineManager>::Global().Create(
      max_batch, max_workspace, nullptr /*engine hold its own stream*/,
      context.Attr<std::string>("engine_uniq_key"));
  engine->InitNetwork();
  framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
  // Add inputs
@ -87,24 +95,23 @@ void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
    PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                      "TensorRT engine only takes LoDTensor as input");
    auto shape = var->GetShape();
-    engine_->DeclareInput(
+    engine->DeclareInput(
        input, FluidDataType2TRT(
                   var->Proto()->type().lod_tensor().tensor().data_type()),
        Vec2TRT_Dims(var->GetShape()));
  }
  // TODO(Superjomn) parameters should be passed in after being analyzed outside.
  inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
-      block_desc, {}, context.scope(), engine_);
+      block_desc, parameters, context.scope(), engine);
  // Add outputs
  VLOG(4) << "declare outputs";
  for (auto &output : context.Outputs("Ys")) {
    VLOG(4) << "declare output " << output;
-    engine_->DeclareOutput(output);
+    engine->DeclareOutput(output);
  }
-  engine_->FreezeNetwork();
+  engine->FreezeNetwork();
 }
 class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
@ -113,6 +120,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Xs", "A list of inputs.").AsDuplicable();
    AddOutput("Ys", "A list of outputs").AsDuplicable();
    AddAttr<std::string>("subgraph", "the subgraph.");
    AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
    AddAttr<int>("max_batch", "the maximum batch size.");
    AddAttr<int>("max_workspace", "the maximum batch size.");
    AddComment("TensorRT engine operator.");
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@ -19,10 +19,14 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 namespace paddle {
 namespace operators {
 using inference::Singleton;
 using inference::tensorrt::TRT_EngineManager;
 class TensorRTEngineOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@ -47,16 +51,18 @@ template <typename DeviceContext, typename T>
 class TensorRTEngineKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    if (!engine_) {
+    auto engine_name = context.Attr<std::string>("engine_uniq_key");
    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
      Prepare(context);
    }
    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
    auto input_names = context.op().Inputs("Xs");
    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
    // Try to determine a batch_size
    auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
        context.scope(), input_names.front());
    int batch_size = tensor0.dims()[0];
-    PADDLE_ENFORCE_LE(batch_size, max_batch_);
+    PADDLE_ENFORCE_LE(batch_size, context.Attr<int>("max_batch"));
    // Convert input tensor from fluid to engine.
    for (const auto& x : context.Inputs("Xs")) {
@ -64,20 +70,20 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
          context.scope(), x);
      if (platform::is_cpu_place(t.place())) {
-        engine_->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
+        engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
-                                 t.memory_size());
+                                t.memory_size());
      } else {
-        engine_->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
+        engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
-                                 t.memory_size());
+                                t.memory_size());
      }
    }
    // Execute the engine.
    PADDLE_ENFORCE_GT(batch_size, 0);
-    engine_->Execute(batch_size);
+    engine->Execute(batch_size);
    // Convert output tensor from engine to fluid
    for (const auto& y : context.Outputs("Ys")) {
      // convert output and copy to fluid.
-      nvinfer1::ITensor* trt_t = engine_->GetITensor(y);
+      nvinfer1::ITensor* trt_t = engine->GetITensor(y);
      auto dims = trt_t->getDimensions();
      // Use the output ITensor's dims to reshape the Fluid Tensor.
      std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
@ -89,27 +95,22 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
      if (platform::is_cpu_place(fluid_t->place())) {
        // TODO(Superjomn) change this float to dtype size.
-        engine_->GetOutputInCPU(
+        engine->GetOutputInCPU(
            y, fluid_t->mutable_data<float>(platform::CPUPlace()),
            size * sizeof(float));
      } else {
-        engine_->GetOutputInGPU(
+        engine->GetOutputInGPU(
            y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
            size * sizeof(float));
      }
    }
-    cudaStreamSynchronize(stream_);
+    cudaStreamSynchronize(*engine->stream());
  }
 protected:
  // Build the engine.
  void Prepare(const framework::ExecutionContext& context) const;
 private:
  mutable cudaStream_t stream_;
  mutable inference::tensorrt::TensorRTEngine* engine_{nullptr};
  mutable int max_batch_{0};
 };
 }  // namespace operators
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@ -79,6 +79,17 @@ void SetAttr<int64_t>(framework::proto::OpDesc* op, const std::string& name,
  attr->set_type(paddle::framework::proto::AttrType::LONG);
  attr->set_l(data);
 }
 // Specialization of SetAttr for a list of strings: appends a new attribute
 // named `name` to `op`, marks its type as AttrType::STRINGS, and adds each
 // element of `data` to the attribute in order.
 template <>
 void SetAttr<std::vector<std::string>>(framework::proto::OpDesc* op,
                                       const std::string& name,
                                       const std::vector<std::string>& data) {
   auto* attr = op->add_attrs();
   attr->set_name(name);
   attr->set_type(paddle::framework::proto::AttrType::STRINGS);
   for (const auto& s : data) {
     attr->add_strings(s.c_str());
   }
 }
 }  // namespace
@ -123,11 +134,15 @@ TEST(TensorRTEngineOp, manual) {
  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                       block_->SerializeAsString());
-  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 30);
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 100);
  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 1 << 10);
  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                    std::vector<std::string>({}));
  LOG(INFO) << "create engine op";
  auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
  LOG(INFO) << "engine_op " << engine_op.get();
  framework::Scope scope;
  platform::CPUPlace place;
@ -145,6 +160,88 @@ TEST(TensorRTEngineOp, manual) {
  engine_op->Run(scope, place);
 }
 // Builds a block of four chained `mul` ops (x0*y0->z0, z0*y1->z1, z1*y2->z2,
 // z2*y3->z3), wraps the serialized block in a single `tensorrt_engine` op and
 // runs that op once against a scope holding the matching CPU tensors.
 //
 //   batch_size  first dimension of the input and of every output; also used
 //               as the engine's "max_batch" attribute.
 //   input_dim   second dimension of x0 and first dimension of y0.
 //   output_dim  second dimension of every weight and every output.
 //   nlayers     not read anywhere in this function; the layer count is fixed
 //               at four by the explicit AddFCLayer calls below.
 void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
  framework::ProgramDesc program;
  framework::Scope scope;
  platform::CPUPlace place;
  // NOTE(review): `ctx` is constructed but not referenced again in this
  // function.
  platform::CPUDeviceContext ctx(place);
  auto* block_ = program.Proto()->add_blocks();
  block_->set_idx(0);
  block_->set_parent_idx(-1);
  using shape_t = std::vector<int64_t>;
  LOG(INFO) << "create block desc";
  framework::BlockDesc block_desc(&program, block_);
  // Appends one `mul` op (X=x_name, Y=y_name, Out=z_name) to the block,
  // declares its variables in the block proto and creates the CPU tensors in
  // `scope`. When `x_created` is true the X variable is assumed to exist
  // already (it is the previous layer's output) and is neither declared nor
  // created again.
  //
  // The *_shape arguments are used only for the scope tensors; the shapes
  // written into the block proto are derived from batch_size / input_dim /
  // output_dim instead. In particular Y is always declared as
  // {input_dim, output_dim}, so the declared and created shapes of y1..y3
  // agree only when input_dim == output_dim.
  auto AddFCLayer = [&](const std::string& x_name, const std::string& y_name,
                        const std::string& z_name, bool x_created,
                        const shape_t& x_shape, const shape_t& y_shape,
                        const shape_t& z_shape) {
    LOG(INFO) << "create fc op";
    auto* fc = block_desc.AppendOp();
    fc->SetType("mul");
    fc->SetInput("X", std::vector<std::string>({x_name}));
    fc->SetInput("Y", std::vector<std::string>({y_name}));
    fc->SetOutput("Out", std::vector<std::string>({z_name}));
    // Set inputs' variable shape in BlockDesc
    if (!x_created) {
      // The input is declared 4-D ({batch, input_dim, 1, 1}) in the block
      // proto, while the scope tensor below is created with `x_shape`.
      AddTensorToBlockDesc(block_, x_name,
                           std::vector<int64_t>({batch_size, input_dim, 1, 1}));
    }
    AddTensorToBlockDesc(block_, y_name,
                         std::vector<int64_t>({input_dim, output_dim}));
    AddTensorToBlockDesc(block_, z_name,
                         std::vector<int64_t>({batch_size, output_dim}));
    // Prepare variables.
    if (!x_created) {
      CreateCPUTensor(&scope, x_name, std::vector<int64_t>(x_shape));
    }
    CreateCPUTensor(&scope, y_name, std::vector<int64_t>(y_shape));
    CreateCPUTensor(&scope, z_name, std::vector<int64_t>(z_shape));
    // It is weird, but the op proto has to be copied into the raw block proto
    // by hand so that it is part of what gets serialized below.
    *block_->add_ops() = *fc->Proto();
  };
  // Test with 4 layer FC
  AddFCLayer("x0", "y0", "z0", false, {batch_size, input_dim},
             {input_dim, output_dim}, {batch_size, output_dim});
  AddFCLayer("z0", "y1", "z1", true, {}, {output_dim, output_dim},
             {batch_size, output_dim});
  AddFCLayer("z1", "y2", "z2", true, {}, {output_dim, output_dim},
             {batch_size, output_dim});
  AddFCLayer("z2", "y3", "z3", true, {}, {output_dim, output_dim},
             {batch_size, output_dim});
  LOG(INFO) << "create tensorrt desc";
  // The engine op consumes x0, produces z3 and carries the whole block as its
  // serialized "subgraph" attribute.
  framework::OpDesc engine_op_desc(nullptr);
  engine_op_desc.SetType("tensorrt_engine");
  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x0"}));
  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z3"}));
  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                       block_->SerializeAsString());
  SetAttr<int>(engine_op_desc.Proto(), "max_batch", batch_size);
  // 2 << 10 == 2048.
  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 2 << 10);
  // The four Y operands are listed as the subgraph's parameters.
  SetAttr<std::vector<std::string>>(
      engine_op_desc.Proto(), "parameters",
      std::vector<std::string>({"y0", "y1", "y2", "y3"}));
  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "b_engine");
  auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
  // Execute them.
  engine_op->Run(scope, place);
 }
 // Test with a larger FC layer: runs Execute with batch_size = 40 and
 // input_dim = output_dim = 256.
 TEST(TensorRTEngineOp, fc) { Execute(40, 256, 256); }
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@ -86,32 +86,24 @@ class UniformRandomOp : public framework::OperatorWithKernel {
 class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddOutput("Out", "(Tensor) The output tensor of uniform random op");
+    AddOutput("Out", "The output tensor of uniform random op");
    AddComment(R"DOC(
 Uniform random operator.
 This operator initializes a tensor with random values sampled from a
-uniform distribution.
+uniform distribution. The random result is in set [min, max].
 )DOC");
-    AddAttr<std::vector<int>>("shape",
+    AddAttr<std::vector<int>>("shape", "The shape of the output tensor");
-                              "(vector<int>) The shape of the output tensor");
+    AddAttr<float>("min", "Minimum value of uniform random. [default -1.0].")
    AddAttr<float>("min",
                   "(float, default -1.0) "
                   "Minimum value of uniform random")
        .SetDefault(-1.0f);
-    AddAttr<float>("max",
+    AddAttr<float>("max", "Maximun value of uniform random. [default 1.0].")
                   "(float, default 1.0) "
                   "Maximun value of uniform random")
        .SetDefault(1.0f);
    AddAttr<int>("seed",
                 "(int, default 0) "
                 "Random seed used for generating samples. "
                 "0 means use a seed generated by the system."
                 "Note that if seed is not 0, this operator will always "
-                 "generate the same random numbers every time.")
+                 "generate the same random numbers every time. [default 0].")
        .SetDefault(0);
-    AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type")
+    AddAttr<int>("dtype", "Output tensor data type. [default 5(FP32)].")
        .SetDefault(framework::proto::VarType::FP32);
  }
 };
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@ -15,11 +15,13 @@
 import framework
 import numpy as np
 import contextlib
 from framework import convert_np_dtype_to_dtype_
 from core import VarDesc
 __all__ = [
-    'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu',
+    'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'force_init_on_cpu',
    'init_on_cpu', 'ConstantInitializer', 'UniformInitializer',
-    'NormalInitializer', 'XavierInitializer'
+    'NormalInitializer', 'XavierInitializer', 'BilinearInitializer'
 ]
 _force_init_on_cpu_ = False
@ -422,6 +424,101 @@ class MSRAInitializer(Initializer):
        return op
 class BilinearInitializer(Initializer):
    """Implements the bilinear initializer.
    This initializer can be used in a transposed convolution operator to
    act as upsampling. Users can upsample a feature map with shape of
    (B, C, H, W) by any integer factor. The usage is:
    >>>  factor = 2
    >>>  w_attr = ParamAttr(learning_rate=0., regularizer=L2Decay(0.),
    >>>                     initializer=Bilinear())
    >>>  conv_up = fluid.layers.conv2d_transpose(
    >>>      input,
    >>>      num_filters=C,
    >>>      output_size=None,
    >>>      filter_size=2 * factor - factor % 2,
    >>>      padding=ceil((factor - 1) / 2.),
    >>>      stride=factor,
    >>>      groups=C,
    >>>      param_attr=w_attr,
    >>>      bias_attr=False)
    Where `num_filters=C` and `groups=C` mean this is a channel-wise
    transposed convolution. The filter shape will be (C, 1, K, K) where K is
    `filter_size`. This initializer sets the same (K, K) interpolation kernel
    for every channel of the filter. The resulting shape of the output
    feature map will be (B, C, factor * H, factor * W). Note that the
    learning rate and the weight decay are set to 0 in order to keep the
    coefficient values of the bilinear interpolation unchanged during
    training.
    """
    def __init__(self):
        """Constructor for BilinearInitializer."""
        super(BilinearInitializer, self).__init__()
    def __call__(self, var, block):
        """Add bilinear initialization ops for a variable.
        Args:
            var (Variable): Variable that needs to be initialized.
            block (Block): The block in which initialization ops should
                           be added.
        Returns:
            the initialization op (an `assign_value` op appended to `block`)
        Raises:
            ValueError: If `var` is not a framework.Variable or `block` is
                        not a framework.Block; if the shape of `var` does
                        not have exactly 4 dimensions or
                        var.shape[2] != var.shape[3]; if the dtype of `var`
                        is not FP32; or if `var` has more than
                        1024 * 1024 elements.
        """
        if not isinstance(var, framework.Variable):
            raise ValueError("var must be framework.Variable.")
        if not isinstance(block, framework.Block):
            raise ValueError("block must be framework.Block.")
        shape = var.shape
        if len(shape) != 4:
            raise ValueError("the length of shape must be 4.")
        if shape[2] != shape[3]:
            raise ValueError("shape[2] must be equal to shape[3].")
        # Reject unsupported variables before doing any per-element work.
        if var.dtype != VarDesc.VarType.FP32:
            raise ValueError("Unsupported dtype %s" % var.dtype)
        value_name = "fp32_values"
        num_elements = int(np.prod(shape))
        if num_elements > 1024 * 1024:
            raise ValueError("The size of input is too big. ")
        weight = np.zeros(num_elements, dtype='float32')
        size = shape[3]
        # factor
        f = np.ceil(size / 2.)
        # center
        c = (2 * f - 1 - f % 2) / (2. * f)
        for i in range(num_elements):
            # Column and row of element i inside its (size, size) kernel.
            # Floor division keeps the row index integral regardless of
            # whether `/` means integer or true division.
            x = i % size
            y = (i // size) % size
            weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c))
        weight = np.reshape(weight, shape)
        values = [float(v) for v in weight.flat]
        op = block.append_op(
            type='assign_value',
            outputs={'Out': [var]},
            attrs={
                'dtype': var.dtype,
                'shape': list(shape),
                value_name: values
            })
        var.op = op
        return op
 # We short the class name, since users will use the initializer with the package
 # name. The sample code:
 #
@ -436,3 +533,4 @@ Uniform = UniformInitializer
 Normal = NormalInitializer
 Xavier = XavierInitializer
 MSRA = MSRAInitializer
 Bilinear = BilinearInitializer
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@ -210,53 +210,68 @@ def bipartite_match(dist_matrix,
                    dist_threshold=None,
                    name=None):
    """
-    **Bipartite matchint operator**
+    This operator implements a greedy bipartite matching algorithm, which is
-
+    used to obtain the matching with the maximum distance based on the input
    This operator is a greedy bipartite matching algorithm, which is used to
    obtain the matching with the maximum distance based on the input
    distance matrix. For input 2D matrix, the bipartite matching algorithm can
-    find the matched column for each row, also can find the matched row for
+    find the matched column for each row (matched means the largest distance),
-    each column. And this operator only calculate matched indices from column
+    also can find the matched row for each column. And this operator only
-    to row. For each instance, the number of matched indices is the number of
+    calculate matched indices from column to row. For each instance,
-    of columns of the input ditance matrix.
+    the number of matched indices is the column number of the input distance
-
+    matrix.
-    There are two outputs to save matched indices and distance.
+
-    A simple description, this algothrim matched the best (maximum distance)
+    There are two outputs, matched indices and distance.
    A simple description, this algorithm matched the best (maximum distance)
    row entity to the column entity and the matched indices are not duplicated
    in each row of ColToRowMatchIndices. If the column entity is not matched
    any row entity, set -1 in ColToRowMatchIndices.
-    Please note that the input DistMat can be LoDTensor (with LoD) or Tensor.
+    NOTE: the input DistMat can be LoDTensor (with LoD) or Tensor.
    If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size.
    If Tensor, the height of ColToRowMatchIndices is 1.
    NOTE: This API is a very low level API. It is used by :code:`ssd_loss`
    layer. Please consider to use :code:`ssd_loss` instead.
    Args:
        dist_matrix(Variable): This input is a 2-D LoDTensor with shape
            [K, M]. It is pair-wise distance matrix between the entities
            represented by each row and each column. For example, assumed one
            entity is A with shape [K], another entity is B with shape [M]. The
-            dist_matirx[i][j] is the distance between A[i] and B[j]. The bigger
+            dist_matrix[i][j] is the distance between A[i] and B[j]. The bigger
-            the distance is, the better macthing the pairs are. Please note,
+            the distance is, the better matching the pairs are.
-            This tensor can contain LoD information to represent a batch of
+
-            inputs. One instance of this batch can contain different numbers of
+            NOTE: This tensor can contain LoD information to represent a batch
-            entities.
+            of inputs. One instance of this batch can contain different numbers
            of entities.
        match_type(string|None): The type of matching method, should be
-           'bipartite' or 'per_prediction', 'bipartite' by defalut.
+           'bipartite' or 'per_prediction'. [default 'bipartite'].
        dist_threshold(float|None): If `match_type` is 'per_prediction',
            this threshold is to determine the extra matching bboxes based
-            on the maximum distance, 0.5 by defalut.
+            on the maximum distance, 0.5 by default.
    Returns:
-        match_indices(Variable): A 2-D Tensor with shape [N, M] in int type.
+        tuple: a tuple with two elements is returned. The first is
-            N is the batch size. If match_indices[i][j] is -1, it
+        matched_indices, the second is matched_distance.
-            means B[j] does not match any entity in i-th instance.
+
-            Otherwise, it means B[j] is matched to row
+        The matched_indices is a 2-D Tensor with shape [N, M] in int type.
-            match_indices[i][j] in i-th instance. The row number of
+        N is the batch size. If match_indices[i][j] is -1, it
-            i-th instance is saved in match_indices[i][j].
+        means B[j] does not match any entity in i-th instance.
-        match_distance(Variable): A 2-D Tensor with shape [N, M] in float type.
+        Otherwise, it means B[j] is matched to row
-            N is batch size. If match_indices[i][j] is -1,
+        match_indices[i][j] in i-th instance. The row number of
-            match_distance[i][j] is also -1.0. Otherwise, assumed
+        i-th instance is saved in match_indices[i][j].
-            match_distance[i][j] = d, and the row offsets of each instance
+
-            are called LoD. Then match_distance[i][j] = dist_matrix[d+LoD[i]][j].
+        The matched_distance is a 2-D Tensor with shape [N, M] in float type.
        N is batch size. If match_indices[i][j] is -1,
        match_distance[i][j] is also -1.0. Otherwise, assume
        match_indices[i][j] = d, and the row offsets of each instance
        are called LoD. Then match_distance[i][j] =
        dist_matrix[d+LoD[i]][j].
    Examples:
        >>> x = fluid.layers.data(name='x', shape=[4], dtype='float32')
        >>> y = fluid.layers.data(name='y', shape=[4], dtype='float32')
        >>> iou = fluid.layers.iou_similarity(x=x, y=y)
        >>> matched_indices, matched_dist = fluid.layers.bipartite_match(iou)
    """
    helper = LayerHelper('bipartite_match', **locals())
    match_indices = helper.create_tmp_variable(dtype='int32')
@ -364,7 +379,7 @@ def ssd_loss(location,
             normalize=True,
             sample_size=None):
    """
-    **Multi-box loss layer for object dection algorithm of SSD**
+    **Multi-box loss layer for object detection algorithm of SSD**
    This layer is to compute detection loss for SSD given the location offset
    predictions, confidence predictions, prior boxes and ground-truth bounding
@ -372,21 +387,35 @@ def ssd_loss(location,
    is a weighted sum of the localization loss (or regression loss) and
    confidence loss (or classification loss) by performing the following steps:
-    1. Find matched boundding box by bipartite matching algorithm.
+    1. Find matched bounding box by bipartite matching algorithm.
      1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
      1.2 Compute matched bounding box by bipartite matching algorithm.
    2. Compute confidence for mining hard examples
      2.1. Get the target label based on matched indices.
      2.2. Compute confidence loss.
    3. Apply hard example mining to get the negative example indices and update
       the matched indices.
    4. Assign classification and regression targets
      4.1. Encoded bbox according to the prior boxes.
      4.2. Assign regression targets.
      4.3. Assign classification targets.
    5. Compute the overall objective loss.
      5.1 Compute confidence loss.
      5.2 Compute localization loss.
      5.3 Compute the overall weighted loss.
    Args:
@ -421,39 +450,36 @@ def ssd_loss(location,
        mining_type (str): The hard example mining type, should be 'hard_example'
            or 'max_negative', now only support `max_negative`.
        normalize (bool): Whether to normalize the SSD loss by the total number
-            of output locations, True by defalut.
+            of output locations, True by default.
        sample_size (int): The max sample size of negative box, used only when
            mining_type is 'hard_example'.
    Returns:
-        Variable: The weighted sum of the localization loss and confidence loss,
+        The weighted sum of the localization loss and confidence loss, with \
-            with shape [N * Np, 1], N and Np are the same as they are
+        shape [N * Np, 1], N and Np are the same as they are in `location`.
            in `location`.
    Raises:
-        ValueError: If mining_type is 'hard_example', now only support
+        ValueError: If mining_type is 'hard_example', now only support mining \
-            mining type of `max_negative`.
+        type of `max_negative`.
    Examples:
-        .. code-block:: python
+        >>> pb = fluid.layers.data(
-
+        >>>                   name='prior_box',
-            pb = layers.data(
+        >>>                   shape=[10, 4],
-                name='prior_box',
+        >>>                   append_batch_size=False,
-                shape=[10, 4],
+        >>>                   dtype='float32')
-                append_batch_size=False,
+        >>> pbv = fluid.layers.data(
-                dtype='float32')
+        >>>                   name='prior_box_var',
-            pbv = layers.data(
+        >>>                   shape=[10, 4],
-                name='prior_box_var',
+        >>>                   append_batch_size=False,
-                shape=[10, 4],
+        >>>                   dtype='float32')
-                append_batch_size=False,
+        >>> loc = fluid.layers.data(name='target_box', shape=[10, 4], dtype='float32')
-                dtype='float32')
+        >>> scores = fluid.layers.data(name='scores', shape=[10, 21], dtype='float32')
-            loc = layers.data(name='target_box', shape=[10, 4], dtype='float32')
+        >>> gt_box = fluid.layers.data(
-            scores = layers.data(name='scores', shape=[10, 21], dtype='float32')
+        >>>         name='gt_box', shape=[4], lod_level=1, dtype='float32')
-            gt_box = layers.data(
+        >>> gt_label = fluid.layers.data(
-                name='gt_box', shape=[4], lod_level=1, dtype='float32')
+        >>>         name='gt_label', shape=[1], lod_level=1, dtype='float32')
-            gt_label = layers.data(
+        >>> loss = fluid.layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
                name='gt_label', shape=[1], lod_level=1, dtype='float32')
            loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
    """
    helper = LayerHelper('ssd_loss', **locals())
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@ -22,9 +22,9 @@ from ..executor import global_scope
 from layer_function_generator import generate_layer_fn, templatedoc
 __all__ = [
-    'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
+    'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'Recv',
-    'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
+    'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch',
-    'random_data_generator', 'Preprocessor', 'load'
+    'double_buffer', 'random_data_generator', 'Preprocessor', 'load'
 ]
@ -177,18 +177,17 @@ class ListenAndServ(object):
            })
-def Send(endpoints, send_vars, get_vars=None):
+def Send(endpoints, send_vars, sync=True):
    """
-    Send layer
+    Send variables to the server side, and get vars from server
    side when server have finished running server side program.
    Args:
-        endpoints: comma seperated IP:PORT pairs in the order
+        endpoints (str): comma separated IP:PORT pairs in the order
                   of send_vars to send
-        send_vars: vars to send
+        send_vars (list): variables to send to server
-        get_vars: vars to get from server after send completes.
+        sync (bool): whether to wait the request finish
-
+    
    Send variables to the server side, and get vars from server
    side when server have finished running server side program.
    """
    assert (type(send_vars) == list)
@ -196,40 +195,33 @@ def Send(endpoints, send_vars, get_vars=None):
    endpoints = list(set(epmap))
    helper = LayerHelper("Send", **locals())
    if not get_vars:
        get_vars = []
        for s in send_vars:
            v = helper.create_tmp_variable(dtype=s.dtype, stop_gradient=True)
            get_vars.append(v)
    rpc_op_role_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
    helper.append_op(
        type="send",
        inputs={"X": send_vars},
        outputs={"Out": get_vars},
        attrs={
            "endpoints": endpoints,
            "epmap": epmap,
            rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC
        })
-
+    if sync:
-    return get_vars
+        helper.append_op(type="send_barrier", attrs={"endpoints": endpoints})
-def Recv(endpoints, get_vars):
+def Recv(endpoints, get_vars, sync=True):
    """
-    Recv layer
+    Receive variables from server side
    Args:
-        endpoints: comma seperated IP:PORT pairs in the order
+        endpoints (str): comma separated IP:PORT pairs in the order
                   of send_vars to send
-        send_vars: vars to send
+        get_vars (list): vars to get from server after send completes.
-        get_vars: vars to get from server after send completes.
+        sync (bool): whether to wait the request finish
-    Send variables to the server side, and get vars from server
+    Returns:
-    side when server have finished running server side program.
+        list: list of received variables
    """
    assert (type(send_vars) == list)
    assert (type(get_vars) == list)
    epmap = endpoints.split(",")
@ -242,6 +234,9 @@ def Recv(endpoints, get_vars):
        outputs={"Out": get_vars},
        attrs={"endpoints": endpoints,
               "epmap": epmap})
    if sync:
        helper.append_op(type="fetch_barrier", attrs={"endpoints": endpoints})
    return get_vars
 def monkey_patch_reader_methods(reader):
@ -292,6 +287,7 @@ def _copy_reader_create_op_(block, op):
    return new_op
@templatedoc(op_type='create_recordio_file_reader')
 def open_recordio_file(filename,
                       shapes,
                       lod_levels,
@ -299,34 +295,30 @@ def open_recordio_file(filename,
                       pass_num=1,
                       for_parallel=True):
    """
-    Open a RecordIO file
+    ${comment}
    This layer takes a RecordIO file to read from and returns a Reader Variable.
    Via the Reader Variable, we can get data from the given RecordIO file.
    Args:
-       filename(str): The RecordIO file's name.
+       filename(${filename_type}): ${filename_comment}.
       shapes(list): List of tuples which declaring data shapes.
-       lod_levels(list): List of ints which declaring data lod_level.
+       lod_levels(${lod_levels_type}): ${lod_levels_comment}.
       dtypes(list): List of strs which declaring data type.
       pass_num(int): Number of passes to run.
       for_parallel(Bool): Set it as True if you are going to run
            subsequent operators in parallel.
    Returns:
-       Variable: A Reader Variable via which we can get RecordIO file data.
+       ${out_comment}.
    Examples:
       .. code-block:: python
         reader = fluid.layers.io.open_recordio_file(
                                          filename='./data.recordio',
                                          shapes=[(3,224,224), (1)],
                                          lod_levels=[0, 0],
                                          dtypes=['float32', 'int64'])
-         # Via the reader, we can use 'read_file' layer to get data:
+        >>> import paddle.fluid as fluid
-         image, label = fluid.layers.io.read_file(reader)
+        >>> reader = fluid.layers.io.open_recordio_file(
        >>>                               filename='./data.recordio',
        >>>                               shapes=[(3,224,224), (1)],
        >>>                               lod_levels=[0, 0],
        >>>                               dtypes=['float32', 'int64'])
        >>> # Via the reader, we can use 'read_file' layer to get data:
        >>> image, label = fluid.layers.io.read_file(reader)
    """
    dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
    shape_concat = []
@ -386,16 +378,16 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
       Variable: A Reader Variable from which we can get random data.
    Examples:
       .. code-block:: python
-         reader = fluid.layers.io.random_data_generator(
+        .. code-block:: python
                                          low=0.0,
                                          high=1.0,
                                          shapes=[(3,224,224), (1)],
                                          lod_levels=[0, 0])
-         # Via the reader, we can use 'read_file' layer to get data:
+            reader = fluid.layers.random_data_generator(
-         image, label = fluid.layers.io.read_file(reader)
+                                             low=0.0,
                                             high=1.0,
                                             shapes=[[3,224,224], [1]],
                                             lod_levels=[0, 0])
            # Via the reader, we can use 'read_file' layer to get data:
            image, label = fluid.layers.read_file(reader)
    """
    dtypes = [core.VarDesc.VarType.FP32] * len(shapes)
    shape_concat = []
@ -544,6 +536,9 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
 def shuffle(reader, buffer_size):
    """
    Shuffle the reader.
    """
    return __create_unshared_decorated_reader__(
        'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
@ -554,6 +549,29 @@ def batch(reader, batch_size):
 def double_buffer(reader, place=None, name=None):
    """
    Wrap a double buffer reader. The data will be copied to the target place
    through a double buffer queue. If the target place is None, the place that
    the executor runs on will be used.
    Args:
        reader(Variable): the reader variable that needs to be wrapped.
        place(Place): the place of target data. Default is the same place that
            the executor runs on.
        name(str): Variable name. None if the user does not care.
    Returns:
        wrapped reader with double buffer.
    Examples:
        >>> reader = fluid.layers.open_files(filenames=['somefile'],
        >>>                                  shapes=[[-1, 784], [-1, 1]],
        >>>                                  dtypes=['float32', 'int64'])
        >>> reader = fluid.layers.double_buffer(reader)
        >>> img, label = fluid.layers.read_file(reader)
    """
    attrs = dict()
    if place is not None:
        attrs['place'] = str(place).upper()
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@ -44,6 +44,11 @@ def _type_to_str_(tp):
    return framework_pb2.AttrType.Name(tp)
 _two_dollar_pattern_ = re.compile(r"\$\$([^\$]+)\$\$")
 _single_dollar_pattern_ = re.compile(r"\$([^\$]+)\$")
 _two_bang_pattern_ = re.compile(r"!!([^!]+)!!")
 def _generate_doc_string_(op_proto):
    """
    Generate docstring by OpProto
@ -55,22 +60,26 @@ def _generate_doc_string_(op_proto):
        str: the document string
    """
    def escape_math(text):
        # Three passes, in this order:
        #   1. $$...$$ spans are rewritten to !!...!! so the next pass does
        #      not match their dollar signs.
        #   2. remaining $...$ spans become :math:`...`.
        #   3. !!...!! spans are rewritten to $$...$$.
        shielded = _two_dollar_pattern_.sub(r"!!\1!!", text)
        inlined = _single_dollar_pattern_.sub(r':math:`\1`', shielded)
        return _two_bang_pattern_.sub(r'$$\1$$', inlined)
    if not isinstance(op_proto, framework_pb2.OpProto):
        raise TypeError("OpProto should be `framework_pb2.OpProto`")
    buf = cStringIO.StringIO()
-    buf.write(op_proto.comment)
+    buf.write(escape_math(op_proto.comment))
    buf.write('\nArgs:\n')
    for each_input in op_proto.inputs:
        line_begin = '    {0}: '.format(_convert_(each_input.name))
        buf.write(line_begin)
-        buf.write(each_input.comment)
+        buf.write(escape_math(each_input.comment))
-        buf.write('\n')
+        if each_input.duplicable:
-        buf.write(' ' * len(line_begin))
+            buf.write("  Duplicatable.")
-        buf.write('Duplicable: ')
+        if each_input.dispensable:
-        buf.write(str(each_input.duplicable))
+            buf.write("  Optional.")
        buf.write('  Optional: ')
        buf.write(str(each_input.dispensable))
        buf.write('\n')
    skip_attrs = OpProtoHolder.generated_op_attr_names()
@ -83,7 +92,7 @@ def _generate_doc_string_(op_proto):
        buf.write(' (')
        buf.write(_type_to_str_(each_attr.type))
        buf.write('): ')
-        buf.write(each_attr.comment)
+        buf.write(escape_math(each_attr.comment))
        buf.write('\n')
    if len(op_proto.outputs) != 0:
@ -92,7 +101,7 @@ def _generate_doc_string_(op_proto):
        for each_opt in op_proto.outputs:
            if not each_opt.intermediate:
                break
-        buf.write(each_opt.comment)
+        buf.write(escape_math(each_opt.comment))
    return buf.getvalue()
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/Show More
+++ b/Show More