Merge remote-tracking branch 'upstream/master'

9 years ago · 92ca98d5f5
parent 1f8c21978d aeb2d848d0
commit 92ca98d5f5
49 changed files with 1006 additions and 314 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -2,6 +2,9 @@ language: cpp
 cache: ccache
 sudo: required
 dist: trusty
+env:
+  - JOB=DOCS
+  - JOB=BUILD_AND_TEST
 addons:
  apt:
    packages:
@ -16,6 +19,7 @@ addons:
      - python2.7-dev
      - m4
      - libprotobuf-dev
+      - doxygen
      - protobuf-compiler
      - python-protobuf
      - python-numpy
@ -24,12 +28,10 @@ addons:
      - libgflags-dev
      - libgtest-dev
 before_install:
-  - pip install wheel protobuf
+  - pip install wheel protobuf sphinx breathe recommonmark
  - sudo paddle/scripts/travis/before_install.sh
 script:
-  - paddle/scripts/travis/build.sh
-  - paddle/scripts/travis/unittest.sh
-  - paddle/scripts/travis/make_install.sh
+  - paddle/scripts/travis/main.sh
 notifications:
  email:
    on_success: change
--- a/doc/build/contribute_to_paddle.md
+++ b/doc/build/contribute_to_paddle.md
@ -25,7 +25,7 @@ repo or just head straight to the command line:
 
 ```shell
 # Clone your fork to your local machine
-git clone git@github.com:USERNAME/paddle.git
+git clone git@github.com:USERNAME/Paddle.git
 ```
 Then you can start to develop. 

@ -52,7 +52,7 @@ To do this, you'll need to add a remote at first:
 # see the current configured remote repository
 git remote -v
 # add upstream repository
-git remote add upstream https://github.com/paddle/paddle.git
+git remote add upstream https://github.com/baidu/Paddle.git
 # verify the new upstream
 git remote -v
 ```
--- a/doc/build/index.rst
+++ b/doc/build/index.rst
@ -9,6 +9,7 @@ Install PaddlePaddle
    :glob:

    install_*
+    internal/install_from_jumbo.md

 Build from Source
 -----------------
--- a/doc/cluster/index.rst
+++ b/doc/cluster/index.rst
@ -5,3 +5,4 @@ Cluster Train
  :glob:

  opensource/cluster_train.md
+  internal/index.md
--- a/doc/ui/api/trainer_config_helpers/layers.rst
+++ b/doc/ui/api/trainer_config_helpers/layers.rst
@ -245,10 +245,10 @@ addto_layer
    :members: addto_layer
    :noindex:

-convex_comb_layer
+linear_comb_layer
 -----------------
 ..  automodule:: paddle.trainer_config_helpers.layers
-    :members: convex_comb_layer
+    :members: linear_comb_layer
    :noindex:

 interpolation_layer
@ -280,7 +280,13 @@ tensor_layer
 ..  automodule:: paddle.trainer_config_helpers.layers
    :members: tensor_layer
    :noindex:
-    
+
+cos_sim
+-------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: cos_sim
+    :noindex:
+
 trans_layer
 ------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@ -341,12 +347,6 @@ rank_cost
    :members: rank_cost
    :noindex:

-cos_sim
-------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: cos_sim
-    :noindex:
-
 crf_layer
 -----------------
 ..  automodule:: paddle.trainer_config_helpers.layers
--- a/doc_cn/build_and_install/index.rst
+++ b/doc_cn/build_and_install/index.rst
@ -9,7 +9,11 @@ Note: The intallation packages are still in pre-release state and your experienc

 .. toctree::
   :maxdepth: 1
+   :glob:
   
+   源码下载(对内) <../build/internal/download_paddle_source_zh_cn.rst>
+   使用Jumbo安装(对内) <../build/internal/install_from_jumbo.rst>
+   从源码编译安装(对内)  <../build/internal/build_from_source_zh_cn.rst>
   install/docker_install.rst 
   install/ubuntu_install.rst
   cmake/index.rst
--- a/doc_cn/cluster/index.rst
+++ b/doc_cn/cluster/index.rst
@ -0,0 +1,11 @@
+集群训练
+========
+
+* `集群训练 <../../doc/cluster/index.html>`_
+
+.. toctree::
+    :maxdepth: 2
+    :glob:
+
+    集群训练(对内) <internal/index.md>
+
--- a/doc_cn/index.rst
+++ b/doc_cn/index.rst
@ -8,7 +8,7 @@ PaddlePaddle文档
 * `用户接口 <ui/index.html>`_
 * `使用示例 <demo/index.html>`_
 * `模型配置 <../doc/ui/api/trainer_config_helpers/index.html>`_
-* `集群训练 <../doc/cluster/index.html>`_
+* `集群训练 <cluster/index.html>`_

 开发指南
 --------
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@ -150,7 +150,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP)


 // APIs available after R4:
-#if CUDNN_VERSION >= 4000
+#if CUDNN_VERSION >= 4007
 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)             \
  __macro(cudnnBatchNormalizationForwardTraining)            \
  __macro(cudnnBatchNormalizationForwardInference)           \
@ -999,7 +999,7 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
                                    double epsilon,
                                    real *savedMean,
                                    real *savedVar) {
-#if CUDNN_VERSION >= 4000
+#if CUDNN_VERSION >= 4007
  if ((NULL != runningMean && NULL == runningInvVar) ||
      (NULL == runningMean && NULL != runningInvVar)) {
    LOG(FATAL) << "runningMean and runningInvVar can be NULL "
@ -1024,7 +1024,7 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,

  CHECK_SYNC("hl_batch_norm_forward_training failed");
 #else
-  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4000. "
+  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
             << "But cudnn lib version is " << g_cudnn_lib_version;
 #endif
 }
@ -1039,7 +1039,7 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
                                    real *estimatedMean,
                                    real *estimatedInvVar,
                                    double epsilon) {
-#if CUDNN_VERSION >= 4000
+#if CUDNN_VERSION >= 4007
  cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
  cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
  cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc);
@ -1053,7 +1053,7 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,

  CHECK_SYNC("hl_batch_norm_forward_inference failed");
 #else
-  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4000. "
+  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
             << "But cudnn lib version is " << g_cudnn_lib_version;
 #endif
 }
@ -1071,7 +1071,7 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
                            double epsilon,
                            real *savedMean,
                            real *savedInvVar) {
-#if CUDNN_VERSION >= 4000
+#if CUDNN_VERSION >= 4007
  if ((NULL != savedMean && NULL == savedInvVar) ||
      (NULL == savedMean && NULL != savedInvVar)) {
    LOG(FATAL) << "savedMean and savedVar can be NULL "
@ -1087,16 +1087,14 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
  CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(
              t_resource.cudnn_handle, mode, &alpha, &beta,
-#if CUDNN_VERSION >= 5000
              &alpha, &beta,
-#endif
              xDesc, input, dyDesc, outGrad, dxDesc, inGrad,
              bnDesc, scale, scaleGrad, biasGrad, epsilon,
              savedMean, savedInvVar));

  CHECK_SYNC("hl_batch_norm_backward failed");
 #else
-  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4000. "
+  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
             << "But cudnn lib version is " << g_cudnn_lib_version;
 #endif
 }
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@ -19,6 +19,7 @@ limitations under the License. */
 #include "hl_matrix_apply.cuh"
 #include "hl_sequence.h"
 #include "paddle/utils/Logging.h"
+#include "hl_device_functions.cuh"

 DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
 DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b);
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@ -194,8 +194,8 @@ public:
  virtual real evalImp(std::vector<Argument>& arguments) {
    CHECK_EQ(arguments.size(), (size_t)2);
    Argument output, label;
-    output.resizeAndCopyFrom(arguments[0], false);
-    label.resizeAndCopyFrom(arguments[1], false);
+    output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT);
+    label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT);
    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
    CHECK(label.sequenceStartPositions);
    CHECK(label.ids);
@ -207,7 +207,7 @@ public:
      real err = 0;
      err = editDistance(
          output.value->getData() + output.value->getWidth() * outputStarts[i],
-          output.value->getHeight(), output.value->getWidth(),
+          outputStarts[i+1] - outputStarts[i], output.value->getWidth(),
          label.ids->getData() + labelStarts[i],
          labelStarts[i + 1] - labelStarts[i]);

@ -224,6 +224,9 @@ public:
    for (const std::string& name : config_.input_layers()) {
      arguments.push_back(nn.getLayer(name)->getOutput());
    }
+  }
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
    numSequences_ += arguments[1].getNumSequences();
  }

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-
 #pragma once

 #include "GradientMachine.h"
@ -101,7 +100,7 @@ public:
   * Return true if this prefix or candidate is expected to be dropped.
   */
  typedef std::function<bool(int seqId, const std::vector<int>&,
-      const std::vector<real>&)> DropCallback;
+                             const std::vector<real>&)> DropCallback;

  /**
    * @brief NormOrDropNodeCallback
@ -117,7 +116,7 @@ public:
    * The fourth parameter is the probability of the whole path.
    */
  typedef std::function<void(int seqId, const std::vector<int>&,
-      std::vector<real>&, real*)> NormOrDropNodeCallback;
+                             std::vector<real>&, real*)> NormOrDropNodeCallback;

  /**
   * @brief Register beam search control callbacks. Used for prediction.
@ -192,7 +191,7 @@ public:

    int machineId;  // index of sample in frame
    int topIndex;   // index of MaxIdLayer output in one sample
-    int seqId;  // index of sequence in batch generation
+    int seqId;      // index of sequence in batch generation
    std::vector<int> machineIdVec;

    /**
@ -206,7 +205,10 @@ public:
    /**
     * @brief Path default ctor, first logProb is 0.
     */
-    Path() { logProb = 0; seqId = 0; }
+    Path() {
+      logProb = 0;
+      seqId = 0;
+    }
    explicit Path(size_t seqId) : seqId(seqId) { logProb = 0; }

    /**
@ -319,21 +321,33 @@ protected:
  };
  std::vector<MemoryFrameLine> memoryFrameLines_;

-  // All inFrameLines and outFrameLines have the same element as follows.
+  // Each inFrameLine (inlink) has its own Info (the struct below),
+  // and all outFrameLines (outlinks) share the Info of one inFrameLine,
+  // which is selected by targetInfoInlinkId_.
  struct Info {
    IVectorPtr allIds;         // scattered id of realLayer
    std::vector<int> idIndex;  // index of allIds
    ICpuGpuVectorPtr
-        sequenceStartPositions;      // scattered sequenceStartPositions
+        sequenceStartPositions;         // scattered sequenceStartPositions
    std::vector<int> seqStartPosIndex;  // index of sequenceStartPositions
  };
-  Info info_;
+  std::vector<Info> info_;
+
+  // numSeqs_[i] is the number of sequences that are longer than i (for sequence
+  // data) or have more than i subsequences (for subsequence data)
+  std::vector<int> numSeqs_;

-  // if no subSeq, tuple of (seqLength, seqStart, seqIndex, seqIndex)
-  // else, tuple of (subSeqLength, subSeqStart, seqIndex, subSeqIndex)
-  std::vector<std::tuple<int, int, int, int>> seqLengthAndStart_;
+  std::vector<std::vector<Argument::SeqInfo>> seqInfos_;

-  void createInFrameInfo(const Argument& input, PassType passType);
+  // the id of the inlink which shares its info with the outlinks
+  int targetInfoInlinkId_;
+
+  /* Create scattered id information for all realLayer of inFrameLines one time.
+  *  If hasSubseq, will also create scattered sequenceStartPositions information
+  *  for all realLayer of inFrameLines one time.
+  */
+  void createInFrameInfo(int inlinks_id, const Argument& input,
+                         PassType passType);

  void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine,
                             PassType passType);
@ -363,6 +377,9 @@ protected:

  NeuralNetwork* rootNetwork_;
  bool reversed_;
+
+  // if hasSubseq: max number of sentences (subseq) in batchsize samples
+  // else: max number of tokens in batchsize samples (sentences)
  int maxSequenceLength_;
  bool useGpu_;
  bool stopBeamSearch_;
@ -415,7 +432,7 @@ private:
   * @param machineIdVec : select a row of output matrix in each frame
   * that the generation process expanded.
   */
-  void createDataOutlink(std::vector<int> & machineIdVec);
+  void createDataOutlink(std::vector<int>& machineIdVec);

  /*
   * @brief used in beam search, connect previous frame to form recurrent link
--- a/paddle/gserver/layers/CTCLayer.cpp
+++ b/paddle/gserver/layers/CTCLayer.cpp
@ -49,8 +49,10 @@ void CTCLayer::forward(PassType passType) {
  Layer::forward(passType);
  if (useGpu_) {
    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
    }
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
    forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
  } else {
    forwardImp(getInput(0), getInput(1));
@ -92,9 +94,9 @@ void CTCLayer::backward(const UpdateCallback &callback) {
  if (useGpu_) {
    backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]);
    const_cast<Argument&>(getInput(0)).
-            resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_1);
+            resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT);
    const_cast<Argument&>(getInput(1)).
-            resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_1);
+            resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT);
  } else {
    backwardImp(callback, getInput(0), getInput(1));
  }
--- a/paddle/gserver/layers/ConvOperator.cpp
+++ b/paddle/gserver/layers/ConvOperator.cpp
@ -248,7 +248,7 @@ void ConvOperator::forward() {
  CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
  checkFilterSize(ins_[1]->value);
  Matrix::resizeOrCreate(out_->value, batchSize,
-                         outputH_ * outputW_ * numFilters_);
+                         outputH_ * outputW_ * numFilters_, false, useGpu_);
  {
    AsyncGpuBlock block;
    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
--- a/paddle/gserver/layers/ConvexCombinationLayer.cpp
+++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp
@ -21,18 +21,20 @@ limitations under the License. */
 namespace paddle {

 /**
- * @brief A layer for convex weighted average of vectors,
+ * @brief A layer for weighted sum of vectors,
 * which is used in NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN AND
 * TRANSLATE
- * - Input: the first input contains the convex weights (batchSize x weightDim),
- *          and the shape of second input is (batchSize x (weightdim*dataDim)).
- * - Output: the shape of output is (batchSize x dataDim).
+ * - Input: the size of the first input is weightDim,
+ *          and the size of the second input is weightDim * dataDim.
+ * - Output: the size of the output is dataDim.
 * \f[
- *   out[i][j] = \sum_{j}(in0(i, j) * in1(i,j + i * dataDim)),
- *               i = 0,1,...,(batchSize-1); j = 0, 1,...,(dataDim-1)
+ *   out(j) = \sum_{i}(in0(i) * in1(j + i * dataDim)),
+ *               i = 0,1,...,(weightDim-1); j = 0, 1,...,(dataDim-1)
 * \f]
+ * Note that the above computation is for one sample. Multiple samples are
+ * processed in one batch.
 *
- * The config file api is convex_comb_layer.
+ * The config file api is linear_comb_layer.
 */
 class ConvexCombinationLayer : public Layer {
 protected:
--- a/paddle/gserver/layers/CosSimLayer.cpp
+++ b/paddle/gserver/layers/CosSimLayer.cpp
@ -48,7 +48,7 @@ void CosSimLayer::forward(PassType passType) {
    REGISTER_TIMER_INFO("CosFwAtvTimer", getName().c_str());
    MatrixPtr prevOut1 = getInputValue(0);
    MatrixPtr prevOut2 = getInputValue(1);
-    outV->cosSim(*prevOut1, *prevOut2, kCosSimScale_);
+    outV->cosSim(*prevOut1, *prevOut2, config_.cos_scale());
  }
 }

@ -59,7 +59,7 @@ void CosSimLayer::backward(const UpdateCallback& callback) {

    outG->cosSimDerivative(*this->getOutputValue(), *getInputValue(0),
                           *getInputValue(1), *getInputGrad(0),
-                           *getInputGrad(1), kCosSimScale_);
+                           *getInputGrad(1), config_.cos_scale());
  }
 }

--- a/paddle/gserver/layers/CosSimLayer.h
+++ b/paddle/gserver/layers/CosSimLayer.h
@ -36,7 +36,7 @@ namespace paddle {
 class CosSimLayer : public Layer {
 public:
  explicit CosSimLayer(const LayerConfig& config)
-      : Layer(config), kCosSimScale_(5.0f) {}
+      : Layer(config) {}

  ~CosSimLayer() {}

@ -44,8 +44,6 @@ public:

  void forward(PassType passType);
  void backward(const UpdateCallback& callback = nullptr);
-
-  const real kCosSimScale_;
 };

 }  // namespace paddle
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@ -509,8 +509,10 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label,
                               Matrix &cost) {
  if (useGpu_) {
    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
    }
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
  }
  forwardImpIn(output, label, cost);
 }
--- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
@ -115,29 +115,11 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
    create(tmpBiasGrad_, 1, channels_, &betaGrad);
  }

-  // because of the different api of cudnn v4 and v5.
-  if (hl_get_cudnn_lib_version() < 5000) {
-    if (weight_->getWGrad()) {
-      create(tmpWGrad_, 1, channels_, &gammaGrad);
-    }
-    if (biases_ && biases_->getWGrad()) {
-      create(tmpBiasGrad_, 1, channels_, &betaGrad);
-    }
-  }
-
  hl_batch_norm_backward(ioDesc_, input, ioDesc_, outGrad,
                         ioDesc_, inGrad, bnParamDesc_,
                         gamma, gammaGrad, betaGrad,
                         EPS, savedMean, savedInvVar);

-  // because of the different api of cudnn v4 and v5.
-  if (hl_get_cudnn_lib_version() < 5000) {
-    if (weight_->getWGrad() && biases_->getWGrad()) {
-      weight_->getWGrad()->add(*tmpWGrad_);
-      biases_->getWGrad()->add(*tmpBiasGrad_);
-    }
-  }
-
  {
    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
    biases_->getParameterPtr()->incUpdate(callback);
--- a/paddle/gserver/layers/PrintLayer.cpp
+++ b/paddle/gserver/layers/PrintLayer.cpp
@ -0,0 +1,58 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+class PrintLayer : public Layer {
+public:
+  explicit PrintLayer(const LayerConfig& config)
+      : Layer(config) {}
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback) {}
+};
+
+void PrintLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    const auto& argu = getInput(i);
+    const std::string& name = inputLayers_[i]->getName();
+    if (argu.value) {
+      std::ostringstream os;
+      argu.value->print(os);
+      LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str();
+    }
+    if (argu.ids) {
+      std::ostringstream os;
+      argu.ids->print(os, argu.ids->getSize());
+      LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str();
+    }
+    if (auto startPos = argu.sequenceStartPositions) {
+      std::ostringstream os;
+      startPos->getVector(false)->print(os, startPos->getSize());
+      LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
+    }
+    if (auto subStartPos = argu.subSequenceStartPositions) {
+      std::ostringstream os;
+      subStartPos->getVector(false)->print(os, subStartPos->getSize());
+      LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n"
+                << os.str();
+    }
+  }
+}
+
+REGISTER_LAYER(print, PrintLayer);
+
+}  // namespace paddle
--- a/paddle/gserver/layers/SamplingIdLayer.cpp
+++ b/paddle/gserver/layers/SamplingIdLayer.cpp
@ -52,8 +52,10 @@ public:
    Layer::forward(passType);
    if (useGpu_) {
      for (size_t i = 0; i < inputLayers_.size(); i++) {
-        tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+        tmpCpuInput_[i].resizeAndCopyFrom(
+            getInput(i), false, HPPL_STREAM_DEFAULT);
      }
+      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
      forwardImp(tmpCpuInput_[0]);
    } else {
      forwardImp(getInput(0));
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@ -92,7 +92,6 @@ void testState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
    testLayer->forward(PASS_TEST);
    Argument out;
    out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
    if (batchOut.value) {
      size_t dim = batchOut.value->getWidth();
      ASSERT_TRUE((bool)out.value);
@ -220,7 +219,6 @@ void testBatchState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
    testLayer->forward(PASS_TEST);
    Argument out;
    out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
    if (batchOut.value) {
      size_t dim = batchOut.value->getWidth();
      ASSERT_TRUE((bool)out.value);
--- a/paddle/gserver/tests/Sequence/dummy.list
+++ b/paddle/gserver/tests/Sequence/dummy.list
@ -0,0 +1 @@
+dummy_file_no_use
--- a/paddle/gserver/tests/rnn_data_provider.py
+++ b/paddle/gserver/tests/rnn_data_provider.py
@ -0,0 +1,35 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer.PyDataProvider2 import *
+
+data = [
+    [[[1, 3, 2], [4, 5, 2]], 0],
+    [[[0, 2], [2, 5], [0, 1, 2]], 1],
+]
+
+@provider(input_types=[integer_value_sub_sequence(10),
+                       integer_value(2)])
+def process_subseq(settings, file_name):
+    for d in data:
+        yield d
+
+@provider(input_types=[integer_value_sequence(10),
+                       integer_value(2)])
+def process_seq(settings, file_name):
+    for d in data:
+        seq = []
+        for subseq in d[0]:
+            seq += subseq
+        yield seq, d[1]
--- a/Show More
+++ b/Show More