Merge branch 'develop' into type_alias

8 years ago · 91e8aadaef
parent aa5090de3a e31a469ee0
commit 91e8aadaef
35 changed files with 1276 additions and 110 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -38,7 +38,7 @@ before_install:
  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
  # protobuf version.
  - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
-  - pip install rarfile
+  - pip install rarfile nltk==3.2.2 scipy==0.19.0 recordio matplotlib Pillow
  - curl https://glide.sh/get | bash
  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
  - go get -u github.com/alecthomas/gometalinter
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@ -257,6 +257,16 @@ seq_concat
 ..  autoclass:: paddle.v2.layer.seq_concat
    :noindex:

+kmax_sequence_score
+-------------------
+..  autoclass:: paddle.v2.layer.kmax_sequence_score
+    :noindex:
+
+sub_nested_seq
+--------------
+..  autoclass:: paddle.v2.layer.sub_nested_seq
+    :noindex:
+
 Reshaping Layers
 ================

--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@ -13,15 +13,11 @@
 # serve to show the default.
 import sys
 import os, subprocess
+sys.path.insert(0, os.path.abspath('@PROJ_ROOT@/python'))
 import shlex
 from recommonmark import parser, transform
-try:
-   import py_paddle
-   import paddle
-   import paddle.v2
-except ImportError:
-   print("Must install paddle python package before generating documentation")
-   sys.exit(1)
+import paddle
+import paddle.v2

 MarkdownParser = parser.CommonMarkParser
 AutoStructify = transform.AutoStructify
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@ -13,15 +13,11 @@
 # serve to show the default.
 import sys
 import os, subprocess
+sys.path.insert(0, os.path.abspath('@PROJ_ROOT@/python'))
 import shlex
 from recommonmark import parser, transform
-try:
-   import py_paddle
-   import paddle
-   import paddle.v2
-except ImportError:
-   print("Must install paddle python package before generating documentation")
-   sys.exit(1)
+import paddle
+import paddle.v2


 MarkdownParser = parser.CommonMarkParser
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -40,11 +40,12 @@ if(WITH_PYTHON)
 cc_library(paddle_pybind SHARED
    SRCS pybind.cc
    DEPS pybind python backward
-	fc_op
-	sgd_op
-	add_op
-	mean_op
-	cross_entropy_op
-	fill_zeros_like_op
-	recurrent_op)
+    fc_op
+    sgd_op
+    add_op
+    mean_op
+    cross_entropy_op
+    recurrent_op
+    uniform_random_op
+    fill_zeros_like_op)
 endif(WITH_PYTHON)
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@ -260,12 +260,6 @@ class OpRegistry {
    return CreateOp(op_desc.type(), inputs, outputs, attrs);
  }

-  static bool SupportGPU(const std::string& op_type) {
-    OperatorWithKernel::OpKernelKey key;
-    key.place_ = platform::GPUPlace();
-    return OperatorWithKernel::AllOpKernels().at(op_type).count(key) != 0;
-  }
-
  static std::shared_ptr<OperatorBase> CreateGradOp(const OperatorBase& op) {
    PADDLE_ENFORCE(!op.IsNetOp(),
                   "Use framework::Backward to get backward ops");
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@ -88,6 +88,8 @@ class OperatorBase {

  virtual bool IsNetOp() const { return false; }

+  virtual bool SupportGPU() const { return false; }
+
  /// rename inputs outputs name
  void Rename(const std::string& old_name, const std::string& new_name);

@ -308,7 +310,7 @@ class OperatorWithKernel : public OperatorBase {
  using OpKernelMap =
      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;

-  void InferShape(const Scope& scope) const {
+  void InferShape(const Scope& scope) const override {
    InferShape(InferShapeContext(this, scope));
  }

@ -324,6 +326,12 @@ class OperatorWithKernel : public OperatorBase {
    return g_all_op_kernels;
  }

+  /// Reports whether a kernel keyed by a GPU place is registered for this
+  /// operator's type.
+  bool SupportGPU() const override {
+    // Default-constructed key whose place is then set to the GPU.
+    OperatorWithKernel::OpKernelKey gpu_key;
+    gpu_key.place_ = platform::GPUPlace();
+    const auto& kernels_of_type = OperatorWithKernel::AllOpKernels().at(type_);
+    return kernels_of_type.find(gpu_key) != kernels_of_type.end();
+  }
+
 protected:
  virtual void InferShape(const InferShapeContext& ctx) const = 0;
 };
--- a/paddle/framework/pybind.cc
+++ b/paddle/framework/pybind.cc
@ -39,6 +39,7 @@ USE_OP(softmax);
 USE_OP(rowwise_add);
 USE_OP(fill_zeros_like);
 USE_OP_WITHOUT_KERNEL(recurrent_op);
+USE_OP(uniform_random);
 namespace paddle {
 namespace framework {

@ -56,6 +57,26 @@ void ExposeOperator(ClassType &m) {
           [](const typename ClassType::type &op) -> std::vector<std::string> {
             return op.outputs_;
           })
+      .def("inputs",
+           [](const typename ClassType::type &op) -> std::vector<std::string> {
+             return op.inputs_;
+           })
+      .def("support_gpu", &ClassType::type::SupportGPU)
+      .def("temp_outputs",
+           [](const typename ClassType::type &op) -> std::vector<std::string> {
+             auto iter = op.attrs_.find("temporary_index");
+             std::vector<std::string> ret;
+             if (iter == op.attrs_.end()) {
+               return ret;
+             } else {
+               auto tmp_idx = boost::get<std::vector<int>>(iter->second);
+               ret.reserve(tmp_idx.size());
+               for (auto &index : tmp_idx) {
+                 ret.push_back(op.outputs_.at(index));
+               }
+               return ret;
+             }
+           })
      .def("__str__", &ClassType::type::DebugString);
 }

@ -201,8 +222,6 @@ All parameter, weight, gradient are variables in Paddle.
    return OpRegistry::CreateOp(desc);
  });

-  operator_base.def_static("support_gpu", &OpRegistry::SupportGPU);
-
  operator_base.def("backward",
                    [](const OperatorBase &forwardOp,
                       const std::unordered_set<std::string> &no_grad_vars) {
--- a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
+++ b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * Layer registered as "kmax_seq_score".
+ *
+ * The input is a width-1 score matrix over a sequence or a nested sequence.
+ * For every sequence (or sub-sequence, when the input is nested) the layer
+ * writes the positions of the highest scores, counted from the start of that
+ * sequence, into one row of the output. A row is beam_size wide; slots that
+ * are not filled hold -1. The output matrix is always created on CPU.
+ */
+class KmaxSeqScoreLayer : public Layer {
+private:
+  // CPU-side scores: a copy of the input value when the layer runs on GPU,
+  // otherwise the input value itself.
+  MatrixPtr scores_;
+  // Width of an output row; read from the layer config in init().
+  size_t beamSize_ = 0;
+  // For each sequence delimited by seqStartPos, writes the positions of its
+  // top-scoring elements into the matching row of sortedIds.
+  void kmaxScorePerSeq(const real* scores,
+                       real* sortedIds,
+                       const ICpuGpuVectorPtr seqStartPos);
+
+public:
+  explicit KmaxSeqScoreLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(kmax_seq_score, KmaxSeqScoreLayer);
+
+// Initializes the base layer, validates the configuration and caches the
+// beam size. Returns the result of Layer::init().
+bool KmaxSeqScoreLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  const bool baseOk = Layer::init(layerMap, parameterMap);
+
+  // Exactly one input (the scores) is expected.
+  CHECK_EQ(1U, inputLayers_.size());
+
+  // The beam size comes from the layer config and must be at least 1.
+  beamSize_ = config_.beam_size();
+  CHECK_GE(beamSize_, 1U);
+
+  setNeedSequenceInfo(false);
+  setNeedGradient(false);
+
+  return baseOk;
+}
+
+// For every sequence delimited by seqStartPos, finds the positions (relative
+// to the sequence start) of its min(beamSize_, length) highest scores and
+// copies them, best first, to the start of the matching row of sortedIds.
+void KmaxSeqScoreLayer::kmaxScorePerSeq(const real* scores,
+                                        real* sortedIds,
+                                        const ICpuGpuVectorPtr seqStartPos) {
+  int* starts = seqStartPos->getMutableData(false);
+  // Positions are kept as `real` because they are copied byte-for-byte into
+  // the real-valued output buffer.
+  std::vector<real> candidates;
+  for (size_t seqIdx = 0; seqIdx < seqStartPos->getSize() - 1; ++seqIdx) {
+    const int seqBegin = starts[seqIdx];
+    const int seqEnd = starts[seqIdx + 1];
+    const int seqLen = seqEnd - seqBegin;
+    const int k = std::min(static_cast<int>(beamSize_), seqLen);
+
+    candidates.resize(seqLen, 0);
+    std::iota(candidates.begin(), candidates.end(), 0.);
+
+    // Scores of this sequence only, so that they can be indexed by position.
+    std::vector<real> seqScores(scores + seqBegin, scores + seqEnd);
+    auto higherScoreFirst = [&](size_t lhs, size_t rhs) {
+      return seqScores[lhs] > seqScores[rhs];
+    };
+    std::partial_sort(candidates.begin(),
+                      candidates.begin() + k,
+                      candidates.end(),
+                      higherScoreFirst);
+
+    // Only the first k slots of the row are written.
+    memcpy(
+        sortedIds + (seqIdx * beamSize_), candidates.data(), k * sizeof(real));
+  }
+}
+
+// Validates the input, makes the scores available on CPU, fills the output
+// with -1 and then writes the selected positions of every (sub-)sequence.
+void KmaxSeqScoreLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& input = getInput(0);
+  const MatrixPtr inputScore = getInputValue(0);
+
+  CHECK(input.hasSeq() || input.hasSubseq())
+      << "input of " << getName()
+      << " must be a sequence or a nested sequence.";
+  CHECK_EQ(input.value->getWidth(), 1UL)
+      << "input of " << getName()
+      << " is score over a sequence or a nested sequence, so its width "
+      << " must be 1.";
+
+  if (!useGpu_) {
+    scores_ = inputScore;
+  } else {
+    // This layer only runs on CPU: when the model is running on GPU, the
+    // input is first copied from GPU into a CPU matrix.
+    Matrix::resizeOrCreate(scores_,
+                           inputScore->getHeight(),
+                           1,
+                           false /* trans */,
+                           false /* useGpu */);
+    scores_->copyFrom(*inputScore);
+  }
+
+  // One output row per sub-sequence for nested input, per sequence otherwise.
+  const bool nested = input.hasSubseq();
+  const auto outputRows =
+      nested ? input.getNumSubSequences() : input.getNumSequences();
+  Matrix::resizeOrCreate(output_.value,
+                         outputRows,
+                         beamSize_,
+                         false /* trans */,
+                         false /* useGpu */);
+
+  // Set every slot to -1; slots not overwritten below keep that value.
+  output_.value->one();
+  output_.value->mulScalar(-1.);
+
+  const ICpuGpuVectorPtr& startPositions =
+      nested ? input.subSequenceStartPositions : input.sequenceStartPositions;
+  kmaxScorePerSeq(scores_->getData(), output_.value->getData(), startPositions);
+}
+
+// Empty on purpose of the visible code: init() calls setNeedGradient(false)
+// for this layer, and nothing is done here.
+void KmaxSeqScoreLayer::backward(const UpdateCallback& callback) {}
+
+}  // namespace paddle
--- a/paddle/gserver/layers/SubNestedSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
@ -0,0 +1,176 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+class SubNestedSequenceLayer : public Layer {
+public:
+  explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+private:
+  /*
+   * This function generates the indices of rows in a batch according to the
+   * indices of the selected sub-sequences in each sequence. It also rebuilds
+   * the sequence and sub-sequence start positions of the output.
+   *
+   * Examples:
+   * selectedIndices (one row per sequence, -1 ends the selection of a row):
+   *   [
+   *     [0, 1, -1],
+   *     [0, 1, 2],
+   *     [0, -1, -1],
+   *     [0, 2, 3],
+   *   ]
+   * inputSeqInfo:
+   *   [
+   *     [0,3,4],
+   *     [4,5,7,10,15],
+   *     [15,20],
+   *     [20,22,23,25,28]
+   *   ]
+   *
+   * The output is saved to the private members selectedRows_ / rowIndice_:
+   * [0,1,2,3,4,5,6,7,8,9,
+   *  15,16,17,18,19,20,21,23,24,25,26,27]
+   */
+
+  void calSelectedCols(const MatrixPtr selectedIndices,
+                       const std::vector<std::vector<int>>& inputSeqInfo);
+
+  // if the second input of this layer is on GPU memory, copy it to CPU memory.
+  MatrixPtr selIdsCpu_;
+
+  // reorganized sequenceStartPositions and subSequenceStartPositions
+  // into a 2d vector to facilitate the sequence selection process.
+  std::vector<std::vector<int>> inputSeqInfoVec_;
+
+  // the final selected row indices in a batch.
+  // On the CPU path rowIndice_ is created directly over the data of
+  // selectedRows_ (presumably sharing its memory; confirm against
+  // IVector::create); on the GPU path the indices are copied into it.
+  IVectorPtr rowIndice_;
+  std::vector<int> selectedRows_;
+};
+
+REGISTER_LAYER(sub_nested_seq, SubNestedSequenceLayer);
+
+// Initializes the base layer and checks that the layer has exactly two
+// inputs. Returns the result of Layer::init().
+bool SubNestedSequenceLayer::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class and keep its status, so that a failed
+   * base initialization is reported to the caller instead of being dropped. */
+  const bool ret = Layer::init(layerMap, parameterMap);
+  CHECK_EQ(2U, inputLayers_.size());
+  setNeedSequenceInfo(false);
+  return ret;
+}
+
+// Expands the selected sub-sequence indices into batch row indices
+// (selectedRows_ / rowIndice_) and rebuilds the sequence and sub-sequence
+// start positions of the output.
+//
+// selectedIndices: one row per sequence; each entry is the index of a
+//                  selected sub-sequence, and -1 ends the selection of a row.
+// inputSeqInfo:    for each sequence, the start positions of its
+//                  sub-sequences followed by the end of the sequence.
+void SubNestedSequenceLayer::calSelectedCols(
+    const MatrixPtr selectedIndices,
+    const std::vector<std::vector<int>>& inputSeqInfo) {
+  selectedRows_.clear();
+
+  std::vector<int> outSeqStartInfo(1, 0);
+  std::vector<int> outSubSeqStartInfo(1, 0);
+
+  size_t seqNum = selectedIndices->getHeight();
+  size_t beamSize = selectedIndices->getWidth();
+  // Every row of selectedIndices needs its own entry in inputSeqInfo.
+  CHECK_GE(inputSeqInfo.size(), seqNum);
+  for (size_t i = 0; i < seqNum; ++i) {
+    const std::vector<int>& subSeqStarts = inputSeqInfo[i];
+    for (size_t j = 0; j < beamSize; ++j) {
+      // Read the element once; -1 marks the end of the selection in this row.
+      const real rawIdx = selectedIndices->getElement(i, j);
+      if (rawIdx == -1.) break;
+
+      const int selSubSeqIdx = static_cast<int>(rawIdx);
+      // Check the sign first so the unsigned comparison below cannot wrap.
+      CHECK_GE(selSubSeqIdx, 0);
+      CHECK_GT(subSeqStarts.size(), static_cast<size_t>(selSubSeqIdx) + 1);
+
+      const int subSeqBegin = subSeqStarts[selSubSeqIdx];
+      const int subSeqEnd = subSeqStarts[selSubSeqIdx + 1];
+      for (int row = subSeqBegin; row < subSeqEnd; ++row)
+        selectedRows_.push_back(row);
+      outSubSeqStartInfo.push_back(outSubSeqStartInfo.back() +
+                                   (subSeqEnd - subSeqBegin));
+    }
+    outSeqStartInfo.push_back(outSubSeqStartInfo.back());
+  }
+
+  if (useGpu_) {
+    // GPU path: allocate the index vector and copy the row indices into it.
+    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
+    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
+  } else {
+    // CPU path: create the index vector directly over selectedRows_'s data.
+    rowIndice_ =
+        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
+  }
+
+  // create the sequence information for the output.
+  ICpuGpuVector::resizeOrCreate(
+      output_.sequenceStartPositions, outSeqStartInfo.size(), false);
+  output_.sequenceStartPositions->copyFrom(
+      outSeqStartInfo.data(), outSeqStartInfo.size(), false);
+
+  ICpuGpuVector::resizeOrCreate(
+      output_.subSequenceStartPositions, outSubSeqStartInfo.size(), false);
+  output_.subSequenceStartPositions->copyFrom(
+      outSubSeqStartInfo.data(), outSubSeqStartInfo.size(), false);
+}
+
+// Selects the rows of the chosen sub-sequences of the first input and writes
+// them, in selection order, to the output.
+void SubNestedSequenceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& inputSeq = getInput(0);
+  CHECK(inputSeq.hasSubseq()) << "The first input of SubNestSequence layer "
+                              << "must be a nested sequence.";
+  const MatrixPtr selectedIndices = getInputValue(1);
+  CHECK_EQ(inputSeq.getNumSequences(), selectedIndices->getHeight());
+
+  /*
+   * Currently, the second input for this layer is generated by
+   * kmax_sequence_score_layer whose output is always stored on CPU,
+   * or a data_layer which can be on GPU.
+   *
+   * If the second input is on GPU, copy it to CPU memory, because this
+   * input always uses very little memory, and operations related to it are
+   * all logic control, not computations.
+   */
+  const bool idsOnGpu =
+      dynamic_cast<GpuMatrix*>(selectedIndices.get()) != nullptr;
+  if (!idsOnGpu) {
+    selIdsCpu_ = selectedIndices;
+  } else {
+    Matrix::resizeOrCreate(selIdsCpu_,
+                           selectedIndices->getHeight(),
+                           selectedIndices->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    selIdsCpu_->copyFrom(*selectedIndices);
+  }
+
+  // Regroup the start positions per sequence, then compute the selected rows.
+  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
+                              inputSeq.subSequenceStartPositions,
+                              inputSeqInfoVec_);
+  calSelectedCols(selIdsCpu_, inputSeqInfoVec_);
+
+  const size_t outputHeight = selectedRows_.size();
+  resetOutput(outputHeight, getSize());
+  const MatrixPtr inputValue = getInputValue(0);
+  getOutputValue()->selectRows(*inputValue, *rowIndice_);
+}
+
+// Adds the output gradient to the rows of the first input's gradient that
+// were selected in forward(); does nothing when that input has no gradient.
+void SubNestedSequenceLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inGrad = getInputGrad(0);
+  MatrixPtr outGrad = getOutputGrad();
+
+  if (!inGrad) return;
+  outGrad->addToRows(*inGrad, *rowIndice_);
+}
+
+}  // namespace paddle
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@ -66,6 +66,16 @@ add_unittest_without_exec(test_BatchNorm

 add_test(NAME test_BatchNorm
    COMMAND test_BatchNorm)
+
+
+################# test_KmaxSeqScore #######################
+add_unittest_without_exec(test_KmaxSeqScore
+    test_KmaxSeqScore.cpp
+    LayerGradUtil.cpp)
+
+add_test(NAME test_KmaxSeqScore
+    COMMAND test_KmaxSeqScore)
+
 ################## test_Evaluator #######################
 add_unittest(test_Evaluator
    test_Evaluator.cpp)
--- a/paddle/gserver/tests/test_KmaxSeqScore.cpp
+++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp
@ -0,0 +1,160 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/trainer/Trainer.h"
+#include "paddle/utils/GlobalConstants.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+// Returns n distinct integers from [0, range). When n == range the result is
+// 0..range-1 in order; otherwise it is the first n entries of a shuffle.
+vector<int> randSampling(int range, int n) {
+  CHECK_GE(range, n);
+  vector<int> candidates(range);
+  iota(candidates.begin(), candidates.end(), 0);
+  if (range != n) {
+    random_shuffle(candidates.begin(), candidates.end());
+    candidates.resize(n);
+  }
+  return candidates;
+}
+
+// Fills both vectors with the start positions of a randomly sized nested
+// sequence batch. Any previous content of the vectors is discarded.
+//
+// seqStartPosition:    receives seqNum + 1 sequence boundaries.
+// subSeqStartPosition: receives the sub-sequence boundaries, starting at 0.
+void genRandomSeqInfo(vector<int>& seqStartPosition,
+                      vector<int>& subSeqStartPosition) {
+  const int maxSeqNum = 100;
+  // generate random start position information
+  int seqNum = 1 + (rand() % maxSeqNum);
+  // assign() rather than resize(): resize() keeps the elements a non-empty
+  // vector already holds, which would corrupt the generated positions.
+  seqStartPosition.assign(seqNum + 1, 0);
+  subSeqStartPosition.assign(1, 0);
+
+  for (int i = 0; i < seqNum; ++i) {
+    // One random draw is used both as the number of sub-sequences in
+    // sequence i and as the length of each of them.
+    int subSeqLen = 1 + (rand() % maxSeqNum);
+    for (int j = 0; j < subSeqLen; ++j)
+      subSeqStartPosition.push_back(subSeqStartPosition.back() + subSeqLen);
+    seqStartPosition[i + 1] = subSeqStartPosition.back();
+  }
+}
+
+// For every sequence delimited by startPos, picks up to beamSize random
+// positions, records them in groundTruth (unused slots stay -1) and sets the
+// score at each picked position to 1.
+void genRandomGroundTruth(real* values,
+                          vector<vector<int>>& groundTruth,
+                          vector<int>& startPos,
+                          size_t beamSize) {
+  const size_t seqCount = startPos.size() - 1;
+  groundTruth.resize(seqCount, vector<int>(beamSize, -1));
+  for (size_t seq = 0; seq < seqCount; ++seq) {
+    const int seqLen = startPos[seq + 1] - startPos[seq];
+    const int picks = min(static_cast<int>(beamSize), seqLen);
+    const vector<int> chosen = randSampling(seqLen, picks);
+
+    size_t slot = 0;
+    for (int offset : chosen) {
+      groundTruth[seq][slot++] = offset;
+      values[startPos[seq] + offset] = 1.;
+    }
+  }
+}
+
+// Compares each beamSize-wide row of layerOut with the matching row of
+// groundTruth, ignoring the order of the entries within a row.
+void checkLayerOut(vector<vector<int>> groundTruth,
+                   real* layerOut,
+                   size_t beamSize) {
+  size_t row = 0;
+  for (auto& expected : groundTruth) {
+    const int begPos = row * beamSize;
+    real* rowBegin = layerOut + begPos;
+    vector<real> actual(rowBegin, rowBegin + beamSize);
+
+    // Sort both sides so that only the set of positions is compared.
+    sort(actual.begin(), actual.end());
+    sort(expected.begin(), expected.end());
+    for (size_t j = 0; j < beamSize; ++j) CHECK_EQ(actual[j], expected[j]);
+    ++row;
+  }
+}
+
+// Runs the kmax_seq_score layer on random sequence / nested-sequence input,
+// on CPU and GPU, and checks the selected positions against a ground truth.
+TEST(Layer, kmaxSeqScoreLayer) {
+  const size_t maxBeamSize = 100;
+  int beamSize = 1 + (rand() % maxBeamSize);
+
+  vector<int> seqStartPosition;
+  vector<int> subSeqStartPosition;
+  genRandomSeqInfo(seqStartPosition, subSeqStartPosition);
+  // One score per row; the batch height is the last sub-sequence boundary.
+  MatrixPtr inValue =
+      Matrix::create(subSeqStartPosition.back(), 1, false, false);
+
+  for (auto hasSubseq : {false, true}) {
+    vector<vector<int>> groundTruth;
+    inValue->randomizeUniform();
+    // Overwrites the scores at randomly picked positions with 1 and records
+    // those positions as the expected layer output.
+    genRandomGroundTruth(inValue->getData(),
+                         groundTruth,
+                         hasSubseq ? subSeqStartPosition : seqStartPosition,
+                         beamSize);
+
+    // NOTE(review): the GPU pass runs unconditionally here; confirm that this
+    // is acceptable for builds without GPU support.
+    for (auto useGpu : {false, true}) {
+      TestConfig config;
+      config.layerConfig.set_type("kmax_seq_score");
+      config.layerConfig.set_beam_size(beamSize);
+
+      if (hasSubseq) {
+        config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                    "scores",
+                                    inValue,
+                                    seqStartPosition,
+                                    subSeqStartPosition});
+      } else {
+        config.inputDefs.push_back(
+            {INPUT_SELF_DEFINE_DATA, "scores", inValue, seqStartPosition});
+      }
+      config.layerConfig.add_inputs();
+
+      // data layer initialize
+      std::vector<DataLayerPtr> dataLayers;
+      LayerMap layerMap;
+      vector<Argument> datas;
+      initDataLayer(
+          config,
+          &dataLayers,
+          &datas,
+          &layerMap,
+          "kmax_seq_score",
+          100 /* actually this parameter is unused in self-defined input*/,
+          false,
+          useGpu);
+      // test layer initialize
+      std::vector<ParameterPtr> parameters;
+      LayerPtr kmaxSeqScoreLayer;
+      FLAGS_use_gpu = useGpu;
+      initTestLayer(config, &layerMap, &parameters, &kmaxSeqScoreLayer);
+      kmaxSeqScoreLayer->forward(PASS_TRAIN);
+
+      // The output has one row per (sub-)sequence and beamSize columns.
+      const MatrixPtr outValue = kmaxSeqScoreLayer->getOutputValue();
+      CHECK_EQ(outValue->getHeight(),
+               hasSubseq ? subSeqStartPosition.size() - 1
+                         : seqStartPosition.size() - 1);
+      CHECK_EQ(outValue->getWidth(), beamSize);
+      checkLayerOut(groundTruth, outValue->getData(), beamSize);
+    }
+  }
+}
+
+// Test entry point: initializes gtest and Paddle, seeds rand() from the
+// current time and runs every registered test.
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+
+  const time_t now = time(NULL);
+  srand(static_cast<size_t>(now));
+
+  const int status = RUN_ALL_TESTS();
+  return status;
+}
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@ -1899,6 +1899,84 @@ TEST(Layer, CropLayer) {
  }
 }

+// Returns n distinct values from 0..range-1 in ascending order. When
+// n == range the full range is returned without shuffling.
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> picked(range);
+  iota(picked.begin(), picked.end(), 0.);
+  if (range != n) {
+    random_shuffle(picked.begin(), picked.end());
+    picked.resize(n);
+    sort(picked.begin(), picked.end());
+  }
+  return picked;
+}
+
+// Gradient check of the sub_nested_seq layer on a random nested sequence and
+// a random selection of sub-sequences, on CPU and GPU.
+TEST(Layer, SubNestedSequenceLayer) {
+  // layer size is not crucial for this layer,
+  // so use a small layer size in unittest
+  const int layerSize = 4;
+
+  const int maxSeqNum = 50;
+  const int maxSeqLen = 50;
+  const int maxBeamSize = 32;
+
+  srand((size_t)(time(NULL)));
+  int beamSize = 1 + (rand() % maxBeamSize);
+
+  TestConfig config;
+  config.layerConfig.set_type("sub_nested_seq");
+  config.layerConfig.set_name("sub_nested_seq_layer");
+  config.layerConfig.set_size(layerSize);
+
+  int seqNum = 1 + (rand() % maxSeqNum);
+
+  // sequence information for the first input, it is a nested sequence
+  vector<int> seqStartPos(seqNum + 1, 0);
+  vector<int> subSeqStartPos(1, 0);
+
+  // selected indices; every slot starts as -1 (one() followed by a
+  // multiplication by -1)
+  MatrixPtr selectedIndices = Matrix::create(seqNum, beamSize, false, false);
+  selectedIndices->one();
+  selectedIndices->mulScalar(-1.);
+  real* indicesData = selectedIndices->getData();
+
+  for (int i = 0; i < seqNum; ++i) {
+    // random number of sub-sequences, each with a random length
+    int subSeqNum = 1 + (rand() % maxSeqNum);
+    for (int j = 0; j < subSeqNum; ++j) {
+      subSeqStartPos.push_back(subSeqStartPos.back() +
+                               (1 + (rand() % maxSeqLen)));
+    }
+    // pick at most beamSize sub-sequences and write them to the start of
+    // row i; the remaining slots of the row keep -1
+    vector<real> selSeqs =
+        randSampling(static_cast<real>(subSeqNum), min(beamSize, subSeqNum));
+    memcpy(indicesData + (i * beamSize),
+           selSeqs.data(),
+           selSeqs.size() * sizeof(real));
+    seqStartPos[i + 1] = subSeqStartPos.back();
+  }
+
+  MatrixPtr seqInputPtr =
+      Matrix::create(seqStartPos.back(), layerSize, false, false);
+  seqInputPtr->randomizeUniform();
+  // first input: the nested sequence
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                              "nested_seq_input",
+                              seqInputPtr,
+                              seqStartPos,
+                              subSeqStartPos});
+  config.layerConfig.add_inputs();
+  // second input: the selected sub-sequence indices
+  config.inputDefs.push_back(
+      {INPUT_SELF_DEFINE_DATA, "selected_indices", selectedIndices});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "sub_nested_seq",
+                  /* batchSize */ seqNum,
+                  /* trans */ false,
+                  /* useGpu*/ useGpu,
+                  /* useWeight */ false);
+  }
+}
+
 TEST(Layer, ClipLayer) {
  const size_t batchSize = 128;
  const size_t size = 512;
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@ -67,3 +67,5 @@ op_library(fc_op
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
    DEPS op_desc tensor op_registry operator net_op)
 cc_test(recurrent_op_test SRCS recurrent_op_test.cc DEPS recurrent_op gtest mul_op add_op)
+op_library(uniform_random_op
+        SRCS uniform_random_op.cc uniform_random_op.cu)
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@ -70,9 +70,10 @@ OnehotCrossEntropy Operator.
 namespace ops = paddle::operators;
 REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp,
            ops::OnehotCrossEntropyOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    onehot_cross_entropy,
-    ops::OnehotCrossEntropyOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(onehot_cross_entropy,
+                       ops::OnehotCrossEntropyOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_GRADIENT_OP(onehot_cross_entropy, onehot_cross_entropy_grad,
+                     ops::OnehotCrossEntropyGradientOp);
 REGISTER_OP_CPU_KERNEL(
    onehot_cross_entropy_grad,
    ops::OnehotCrossEntropyGradientOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
@ -59,6 +59,15 @@ class NetOp : public framework::OperatorBase {
    }
  }

+  /// A net supports GPU only if every operator it holds does; an empty net
+  /// reports true. Stops at the first operator that does not.
+  bool SupportGPU() const override {
+    bool all_on_gpu = true;
+    for (auto it = ops_.begin(); all_on_gpu && it != ops_.end(); ++it) {
+      all_on_gpu = (*it)->SupportGPU();
+    }
+    return all_on_gpu;
+  }
+
  /**
   * @brief Add an operator by ptr
   */
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <random>
+#include <type_traits>
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+// Eigen::Tensor::random appears to SEGFAULT on GPU, so uniform random
+// numbers are generated with std <random> on CPU and with thrust::random
+// (thrust ships with the CUDA toolkit) on GPU.
+// CPU kernel of the uniform_random operator: fills output 0 with values
+// drawn from a uniform distribution built from the "min" and "max" attributes.
+template <typename T>
+class CPUUniformRandomKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* tensor = context.Output<framework::Tensor>(0);
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+    unsigned int seed =
+        static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
+    std::minstd_rand engine;
+    // A seed attribute of 0 asks for a seed taken from std::random_device.
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    engine.seed(seed);
+    std::uniform_real_distribution<T> dist(
+        static_cast<T>(context.op_.GetAttr<float>("min")),
+        static_cast<T>(context.op_.GetAttr<float>("max")));
+    // The element count does not change while filling, so compute it once
+    // instead of on every loop iteration.
+    const ssize_t numel = framework::product(tensor->dims());
+    for (ssize_t i = 0; i < numel; ++i) {
+      data[i] = dist(engine);
+    }
+  }
+};
+
+// Operator definition of uniform_random: validates the value range and
+// resizes output 0 to the shape given by the "dims" attribute.
+class UniformRandomOp : public framework::OperatorWithKernel {
+ protected:
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    const float lower = GetAttr<float>("min");
+    const float upper = GetAttr<float>("max");
+    PADDLE_ENFORCE(lower < upper, "uniform_random's min must less then max");
+
+    auto* out = ctx.Output<framework::Tensor>(0);
+    const auto shape = GetAttr<std::vector<int>>("dims");
+    out->Resize(framework::make_ddim(shape));
+  }
+};
+
+// Declares the interface of the uniform_random operator: one output ("Out")
+// and the attributes "dims", "min", "max" and "seed".
+class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  UniformRandomOpMaker(framework::OpProto* proto,
+                       framework::OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "The output tensor of uniform random op");
+    AddComment(R"DOC(Uniform random operator.
+
+Used to initialize tensor with uniform random generator.
+)DOC");
+    // "dims" has no default here; "min"/"max" default to -1 and 1.
+    AddAttr<std::vector<int>>("dims", "the dimension of random tensor");
+    AddAttr<float>("min", "Minimum value of uniform random").SetDefault(-1.0f);
+    AddAttr<float>("max", "Maximun value of uniform random").SetDefault(1.0f);
+    // The kernels replace a seed of 0 with one from std::random_device.
+    AddAttr<int>("seed",
+                 "Random seed of uniform random. "
+                 "0 means generate a seed by system")
+        .SetDefault(0);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(uniform_random, paddle::operators::UniformRandomOp,
+            paddle::operators::UniformRandomOpMaker);
+REGISTER_OP_CPU_KERNEL(uniform_random,
+                       paddle::operators::CPUUniformRandomKernel<float>);
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+// Functor mapping an element index n to a uniformly distributed value: it
+// seeds a fresh engine with seed_, discards n draws and returns the next one
+// mapped into the range given by min_ and max_.
+template <typename T>
+struct UniformGenerator {
+  T min_, max_;
+  unsigned int seed_;
+
+  __host__ __device__ UniformGenerator(T min, T max, int seed)
+      : min_(min), max_(max), seed_(seed) {}
+
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::uniform_real_distribution<T> distribution(min_, max_);
+
+    // Every call starts from the same seed; skipping n draws is what makes
+    // the value depend on the element index.
+    thrust::minstd_rand engine;
+    engine.seed(seed_);
+    engine.discard(n);
+
+    return distribution(engine);
+  }
+};
+
+// Eigen::Tensor::random appears to SEGFAULT on GPU, so uniform random
+// numbers are generated with std <random> on CPU and with thrust::random
+// (thrust ships with the CUDA toolkit) on GPU.
+// GPU kernel of the uniform_random operator: fills output 0 by applying
+// UniformGenerator to the element indices 0..N-1 with thrust::transform.
+template <typename T>
+class GPUUniformRandomKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* out = context.Output<framework::Tensor>(0);
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+
+    const int seed_attr = context.op_.GetAttr<int>("seed");
+    unsigned int seed = static_cast<unsigned int>(seed_attr);
+    // A seed attribute of 0 asks for a seed taken from std::random_device.
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+
+    const T lower = static_cast<T>(context.op_.GetAttr<float>("min"));
+    const T upper = static_cast<T>(context.op_.GetAttr<float>("max"));
+
+    const ssize_t numel = framework::product(out->dims());
+    thrust::counting_iterator<unsigned int> first_index(0);
+    thrust::transform(first_index,
+                      first_index + numel,
+                      thrust::device_ptr<T>(out_data),
+                      UniformGenerator<T>(lower, upper, seed));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_GPU_KERNEL(uniform_random,
+                       paddle::operators::GPUUniformRandomKernel<float>);
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@ -666,4 +666,24 @@ void Argument::subArgFrom(const Argument& input,
  }
 }

+// Groups sub-sequence start positions by the sequence they belong to.
+//
+// On return reorganizedSeqInfo holds one row per sequence; row i contains the
+// entries of subSeqStartPos from the start of sequence i up to and including
+// the start of sequence i + 1. Any previous content of reorganizedSeqInfo is
+// discarded. If seqStartPos holds fewer than two positions there is no
+// complete sequence and the result is left empty.
+void Argument::reorganizeSeqInfo(
+    const ICpuGpuVectorPtr seqStartPos,
+    const ICpuGpuVectorPtr subSeqStartPos,
+    std::vector<std::vector<int>>& reorganizedSeqInfo) {
+  // Start from an empty result so rows of a reused vector are not appended to.
+  reorganizedSeqInfo.clear();
+
+  // Without at least two start positions seqNum would be <= 0 and the loop
+  // below would index into an empty vector.
+  if (seqStartPos->getSize() < 2) return;
+
+  // NOTE(review): `false` presumably selects the CPU copy of the data -
+  // confirm against ICpuGpuVector::getMutableData.
+  int* seqStarts = seqStartPos->getMutableData(false);
+  int* subSeqStarts = subSeqStartPos->getMutableData(false);
+
+  int seqNum = seqStartPos->getSize() - 1;
+  reorganizedSeqInfo.resize(seqNum, std::vector<int>());
+  int seqIdx = 0;
+  for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) {
+    reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
+    if (subSeqStarts[i] == seqStarts[seqIdx + 1]) {
+      // This position closes the current sequence and also opens the next
+      // one, so it is recorded in both rows.
+      seqIdx++;
+      if (seqIdx == seqNum) return;
+      reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
+    }
+  }
+}
+
 }  // namespace paddle
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@ -317,6 +317,30 @@ struct Argument {
   */
  void printValueString(std::ostream& stream,
                        const std::string& prefix = "") const;
+
+  /**
+   * @brief reorganizeSeqInfo will reorganize sequenceStartPositions and
+   * subSequenceStartPositions into a 2 dimensional array: reorganizedSeqInfo.
+   *
+   * @param seqStartPos: sequenceStartPositions of an Argument.
+   * @param subSeqStartPos: subSequenceStartPositions of an Argument.
+   * @param reorganizedSeqInfo: the reorganized sequence start position
+   * information. Row i holds the sub-sequence start positions of sequence i,
+   * from the start of sequence i up to and including the start of
+   * sequence i + 1.
+   *
+   * Examples:
+   * seqStartPos: [0, 4, 15, 20, 28]
+   * subSeqStartPos: [0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28]
+   * reorganizedSeqInfo:
+   *   [
+   *     [0,3,4],
+   *     [4,5,7,10,15],
+   *     [15,20],
+   *     [20,22,23,25,28]
+   *   ]
+   */
+  static void reorganizeSeqInfo(
+      const ICpuGpuVectorPtr seqStartPos,
+      const ICpuGpuVectorPtr subSeqStartPos,
+      std::vector<std::vector<int>>& reorganizedSeqInfo);
 };

 }  // namespace paddle
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@ -5,15 +5,9 @@ set -e
 mkdir -p $TRAVIS_BUILD_DIR/build
 cd $TRAVIS_BUILD_DIR/build

-# Compile paddle binaries first
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_GOLANG=ON -DWITH_STYLE_CHECK=OFF
-
-mkdir output
-make -j `nproc`
-find .. -name '*whl' | xargs pip install  # install all wheels.
-rm -rf *
 # Compile Documentation only.
 cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
+make -j `nproc` gen_proto_py
 make -j `nproc` paddle_docs paddle_docs_cn

 # check websites for broken links
@ -35,6 +29,7 @@ TARGET_BRANCH="gh-pages"
 SOURCE_BRANCH="master"

 # Clone the repo to output directory
+mkdir output
 git clone $REPO output
 cd output

--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@ -17,7 +17,7 @@ foreach(filename ${proto_filenames})
            COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
            ARGS "--python_out=${PROJ_ROOT}/python/paddle/proto"
            "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
-            DEPENDS ${ABS_FIL} ${external_project_dependencies})
+            DEPENDS ${ABS_FIL} protoc)
 endforeach()

 add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY})
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@ -2657,6 +2657,31 @@ class SubSequenceLayer(LayerBase):
        self.create_bias_parameter(bias, size)


+@config_layer('sub_nested_seq')
+class SubNestedSequenceLayer(LayerBase):
+    def __init__(self, name, inputs, selected_indices, bias=False, **xargs):
+        if isinstance(inputs, list):
+            assert len(inputs) == 1, ('the first input of sub_nested_seq '
+                                      'layer is a single nested sequence.')
+            inputs = inputs[0]
+        if isinstance(selected_indices, list):
+            assert len(selected_indices) == 1, (
+                'the second input of '
+                'sub_nested_seq layer is a single layer which is a '
+                'set of selected indices.')
+            selected_indices = selected_indices[0]
+
+        super(SubNestedSequenceLayer, self).__init__(
+            name,
+            'sub_nested_seq',
+            0,
+            inputs=[inputs, selected_indices],
+            **xargs)
+        input_layer0 = self.get_input_layer(0)
+        size = input_layer0.size
+        self.set_layer_size(size)
+
+
@config_layer('out_prod')
 class OuterProdLayer(LayerBase):
    def __init__(self, name, inputs, device=None):
@ -3223,6 +3248,16 @@ class CTCLayer(LayerBase):
        config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs')


+@config_layer('kmax_seq_score')
+class KmaxSeqScoreLayer(LayerBase):
+    def __init__(self, name, inputs, beam_size, **xargs):
+        super(KmaxSeqScoreLayer, self).__init__(
+            name, 'kmax_seq_score', 0, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1, 'KmaxSeqScoreLayer has only one input.')
+        self.config.beam_size = beam_size
+
+
@config_layer('warp_ctc')
 class WarpCTCLayer(LayerBase):
    def __init__(self,
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@ -129,8 +129,10 @@ __all__ = [
    'prelu_layer',
    'gated_unit_layer',
    'crop_layer',
+    'sub_nested_seq_layer',
    'clip_layer',
    'slice_projection',
+    'kmax_sequence_score_layer',
 ]


@ -224,8 +226,11 @@ class LayerType(object):

    PRELU = 'prelu'
    CROP_LAYER = 'crop'
+    SUB_NESTED_SEQ = 'sub_nested_seq'
    CLIP_LAYER = 'clip'

+    KMAX_SEQ_SCORE = 'kmax_seq_score'
+
    @staticmethod
    def is_layer_type(type_name):
        """
@ -6088,6 +6093,53 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
        size=l.config.size)


+@wrap_name_default()
+@layer_support()
+def sub_nested_seq_layer(input, selected_indices, name=None):
+    """
+    The sub_nested_seq_layer accepts two inputs: the first one is a nested
+    sequence; the second one is a set of selected indices in the nested sequence.
+
+    Then sub_nested_seq_layer trims the first nested sequence input according
+    to the selected indices to form a new output. This layer is useful in
+    beam training.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        sub_nest_seq = sub_nested_seq_layer(
+            input=data, selected_indices=selected_indices)
+
+
+    :param input: A nested sequence.
+    :type input: LayerOutput
+    :param selected_indices: a set of sequence indices in the nested sequence.
+    :type selected_indices: LayerOutput
+    :param name: name of this layer.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), (
+        'The first input of '
+        'sub_nested_seq_layer must be a Paddle layer.')
+    assert isinstance(selected_indices, LayerOutput), (
+        'The second input of '
+        'sub_nested_seq_layer must be a Paddle layer.')
+
+    l = Layer(
+        inputs=input.name,
+        selected_indices=selected_indices.name,
+        name=name,
+        type=LayerType.SUB_NESTED_SEQ)
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.SUB_NESTED_SEQ,
+        parents=input,
+        size=l.config.size)
+
+
@wrap_name_default("clip")
 def clip_layer(input, min, max, name=None):
    """
@ -6109,7 +6161,8 @@ def clip_layer(input, min, max, name=None):
    :type min: double
    :param max: The upper threshold for clipping.
    :type max: double
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
    """
    Layer(
        name=name,
@ -6119,3 +6172,41 @@ def clip_layer(input, min, max, name=None):
        max=max)
    return LayerOutput(
        name, LayerType.CLIP_LAYER, parents=[input], size=input.size)
+
+
+@wrap_name_default()
+@layer_support()
+def kmax_sequence_score_layer(input, name=None, beam_size=1):
+    """
+    This layer accepts one input which are scores over a sequence or a nested
+    sequence, and returns indices of beam_size sequences with highest scores.
+
+    .. code-block:: python
+
+        kmax_indices = kmax_sequence_score_layer(input=input_layer, beam_size)
+
+
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input: The input layer. It stores scores over a sequence or a nested
+        sequence and its size must be 1.
+    :type input: LayerOutput.
+    :param beam_size: squence indices with top beam_size scores are returned.
+    :type beam_size: double
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput), ("kmax_sequence_score_layer "
+                                            "accepts only one input.")
+    assert input.size == 1, (
+        "input of kmax_sequence_score_layer is a score"
+        "over a sequence or a nested sequence, so its width must be 1.")
+
+    Layer(
+        name=name,
+        type=LayerType.KMAX_SEQ_SCORE,
+        inputs=[input.name],
+        beam_size=beam_size)
+
+    return LayerOutput(
+        name, LayerType.KMAX_SEQ_SCORE, parents=[input], size=input.size)
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@ -7,6 +7,7 @@ test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
 test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
-test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer)
+test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
+test_kmax_seq_socre_layer test_seq_select_layers)

 export whole_configs=(test_split_datasource)
--- a/Show More
+++ b/Show More