Correctly handle memory in RecurrentGradientMachine for hierarchical RNN

Change-Id: I8e0a8ea6fc2760652d9c76440a539c90860062d3
9 years ago · 9a9de9240d
parent 699d5f2638
commit 9a9de9240d
9 changed files with 207 additions and 9 deletions
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@ -519,7 +519,6 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
          dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
      gatherAgent->addRealLayer(outFrameLine.frames[i]);
    }
    // connect memory links
    // Adopt info_[0].idIndex because seq which has_subseq=True
    // doesn't support Memory with !hasSubseq bootlayer;
@ -529,7 +528,7 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
      NeuralNetwork::connect(
          memoryFrameLine.agents[i],
          i == 0 ? memoryFrameLine.bootLayer : memoryFrameLine.frames[i - 1],
-          idSize /*height of agent*/);
+          numSeqs_[i] /*height of agent*/);
    }
  }
@ -622,6 +621,8 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinks_id,
  // numSequences: # samples(sequences) in a batch
  size_t numSequences = input.getNumSequences();
  std::vector<int> allIds;
  numSeqs_.clear();
  Info* inlink_info = &info_[inlinks_id];
  inlink_info->idIndex.clear();
  inlink_info->idIndex.push_back(0);  // first idIndex = 0
@ -634,10 +635,12 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinks_id,
    // maxSequenceLength_: max number of sentences(subseq) in allsamples
    for (int i = 0; i < maxSequenceLength_; ++i) {
      sequenceStartPositions.push_back(0);            // first element = 0
      int numSeqs = 0;
      for (size_t j = 0; j < numSubSequences; ++j) {  // for each sentence
        // seqLengthAndStart_[inlinks_id][j]:
        // a 4-tuple including <subseqlen, subseqstart, seqid, subseqid>
        if (std::get<3>(seqLengthAndStart_[inlinks_id][j]) == i) {
          ++numSeqs;
          // subseqstart: the cpuSubSequenceStartPositions of this subseq
          int subSeqStart = std::get<1>(seqLengthAndStart_[inlinks_id][j]);
          int subSeqLength = std::get<0>(seqLengthAndStart_[inlinks_id][j]);
@ -650,6 +653,7 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinks_id,
      }
      inlink_info->idIndex.push_back(allIds.size());
      inlink_info->seqStartPosIndex.push_back(sequenceStartPositions.size());
      numSeqs_.push_back(numSeqs);
    }
    // inFrameLine create sequenceStartPositions one time
    CHECK_EQ(sequenceStartPositions.size(),
@ -659,16 +663,19 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinks_id,
    createSeqPos(sequenceStartPositions, &inlink_info->sequenceStartPositions);
  } else {  // for scatterAgentLayer
    for (int i = 0; i < maxSequenceLength_; ++i) {
      int numSeqs = 0;
      for (size_t j = 0; j < numSequences; ++j) {
        int seqLength = std::get<0>(seqLengthAndStart_[inlinks_id][j]);
        if (i >= seqLength) {
          break;
        }
        ++numSeqs;
        int seqStart = std::get<1>(seqLengthAndStart_[inlinks_id][j]);
        allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
                                   : (seqStart + i));
      }
      inlink_info->idIndex.push_back(allIds.size());
      numSeqs_.push_back(numSeqs);
    }
  }
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@ -333,6 +333,10 @@ protected:
  };
  std::vector<Info> info_;
  // numSeqs_[i] is the number of sequences which are longer than i (for
  // sequence data) or have more than i subsequences (for subsequence data)
  std::vector<int> numSeqs_;
  // each inlinks has a "std::vector<std::tuple<int, int, int, int>>" denotes
  // its sequence info:
  //  if hasSubSeq, tuple of (subSeqLength, subSeqStart, seqIndex, subSeqIndex)
--- a/paddle/gserver/tests/Sequence/dummy.list
+++ b/paddle/gserver/tests/Sequence/dummy.list
@ -0,0 +1 @@
 dummy_file_no_use
--- a/paddle/gserver/tests/rnn_data_provider.py
+++ b/paddle/gserver/tests/rnn_data_provider.py
@ -0,0 +1,35 @@
 # Copyright (c) 2016 Baidu, Inc. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from paddle.trainer.PyDataProvider2 import *
 # Two hard-coded samples. Each sample is [sub-sequences, label]: a list of
 # integer lists followed by a single integer label.
 data = [
    [[[1, 3, 2], [4, 5, 2]], 0],
    [[[0, 2], [2, 5], [0, 1, 2]], 1],
 ]
# Yields every sample of `data` unchanged, i.e. with its nested list of
# sub-sequences intact. `settings` and `file_name` are not used.
@provider(input_types=[integer_value_sub_sequence(10),
                       integer_value(2)])
 def process_subseq(settings, file_name):
    for d in data:
        yield d
# Yields every sample of `data` as a flat sequence: the sample's sub-sequences
# are concatenated in order and paired with the sample's label.
# `settings` and `file_name` are not used.
@provider(input_types=[integer_value_sequence(10),
                       integer_value(2)])
 def process_seq(settings, file_name):
    for d in data:
        # d[0] holds the sub-sequences, d[1] the label.
        seq = []
        for subseq in d[0]:
            seq += subseq
        yield seq, d[1]
--- a/paddle/gserver/tests/sequenceGen.py
+++ b/paddle/gserver/tests/sequenceGen.py
@ -1,6 +1,3 @@
 #!/usr/bin/env python
 #coding=utf-8
 # Copyright (c) 2016 Baidu, Inc. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
--- a/paddle/gserver/tests/sequence_nest_rnn.conf
+++ b/paddle/gserver/tests/sequence_nest_rnn.conf
@ -0,0 +1,75 @@
 #edit-mode: -*- python -*-
 # Copyright (c) 2016 Baidu, Inc. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from paddle.trainer_config_helpers import *
 ######################## data source ################################
 define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
                        test_list=None,
                        module='rnn_data_provider',
                        obj='process_subseq')
 settings(batch_size=2, learning_rate=0.01)
 ######################## network configure ################################
 dict_dim = 10
 word_dim = 8
 hidden_dim = 8
 label_dim = 3
 data = data_layer(name="word", size=dict_dim)
 emb = embedding_layer(input=data, size=word_dim)
 # This hierarchical RNN is designed to be equivalent to the simple RNN in
 # sequence_rnn.conf
 # Step function of the outer recurrent group. It runs an inner recurrent
 # group over `x` and returns the inner group's output.
 def outer_step(x):
    # Outer-level state; shares its name with the last_seq layer created below.
    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
    # Step function of the inner recurrent group.
    def inner_step(y):
        # Inner state: shares its name with the fc_layer returned below and is
        # booted from the outer memory.
        inner_mem = memory(name="inner_rnn_state",
                           size=hidden_dim,
                           boot_layer=outer_mem)
        return fc_layer(input=[y, inner_mem],
                        size=hidden_dim,
                        act=TanhActivation(),
                        bias_attr=True,
                        name="inner_rnn_state")
    inner_rnn_output = recurrent_group(
        step=inner_step,
        input=x)
    # `last` is not used afterwards; the layer is created under the same name
    # as outer_mem (NOTE(review): presumably this is what feeds the outer
    # memory on the next outer step -- confirm against the memory() docs).
    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
    # "return last" should also work. But currently RecurrentGradientMachine
    # does not handle it correctly. Current implementation requires that
    # all the out links are from sequences. However, it does not report error
    # when the out links are not sequences.
    return inner_rnn_output
 out = recurrent_group(
    step=outer_step,
    input=SubsequenceInput(emb))
 value_printer_evaluator(input=out)
 rep = last_seq(input=out)
 prob = fc_layer(size=label_dim,
                input=rep,
                act=SoftmaxActivation(),
                bias_attr=True)
 outputs(classification_cost(input=prob,
                            label=data_layer(name="label", size=label_dim)))
--- a/paddle/gserver/tests/sequence_rnn.conf
+++ b/paddle/gserver/tests/sequence_rnn.conf
@ -0,0 +1,57 @@
 #edit-mode: -*- python -*-
 # Copyright (c) 2016 Baidu, Inc. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from paddle.trainer_config_helpers import *
 ######################## data source ################################
 define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
                        test_list=None,
                        module='rnn_data_provider',
                        obj='process_seq')
 settings(batch_size=2, learning_rate=0.01)
 ######################## network configure ################################
 dict_dim = 10
 word_dim = 8
 hidden_dim = 8
 label_dim = 3
 data = data_layer(name="word", size=dict_dim)
 emb = embedding_layer(input=data, size=word_dim)
 # Step function of the recurrent group: combines the step input `y` with the
 # "rnn_state" memory through a tanh fully-connected layer of size hidden_dim.
 def step(y):
    # The memory shares its name with the fc_layer returned below
    # (NOTE(review): presumably it reads that layer's previous-step output --
    # confirm against the memory() docs).
    mem = memory(name="rnn_state", size=hidden_dim)
    return fc_layer(input=[y, mem],
                    size=hidden_dim,
                    act=TanhActivation(),
                    bias_attr=True,
                    name="rnn_state")
 out = recurrent_group(
    step=step,
    input=emb)
 value_printer_evaluator(input=out)
 rep = last_seq(input=out)
 prob = fc_layer(size=label_dim,
                input=rep,
                act=SoftmaxActivation(),
                bias_attr=True)
 outputs(classification_cost(input=prob,
                            label=data_layer(name="label", size=label_dim)))
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@ -21,6 +21,8 @@ limitations under the License. */
 #include <paddle/trainer/TrainerInternal.h>
 #include <paddle/gserver/gradientmachines/GradientMachine.h>
 P_DECLARE_int32(seed);
 using namespace paddle;  // NOLINT
 using namespace std;  // NOLINT
 class TrainerForTest : public paddle::Trainer {
@ -68,7 +70,9 @@ void CalCost(const string& conf, const string& dir, real* cost,
  CpuVector vecMomentum(dim);
  // vecW needs to be assigned, otherwise the variable is an uncertain value.
-  vecW.zeroMem();
+
  *ThreadLocalRand::getSeed() = FLAGS_seed;
  vecW.randnorm(0, 0.1);
  trainer.startTrain();
  for (int i = 0; i < num_passes; ++i) {
@ -88,15 +92,13 @@ void CalCost(const string& conf, const string& dir, real* cost,
  rmDir(dir.c_str());
 }
-TEST(RecurrentGradientMachine, HasSubSequence) {
+void test(const string& conf1, const string& conf2) {
  int num_passes = 5;
  real* cost1 = new real[num_passes];
  const string conf1 = "gserver/tests/sequence_layer_group.conf";
  const string dir1 = "gserver/tests/t1";
  CalCost(conf1, dir1, cost1, num_passes);
  real* cost2 = new real[num_passes];
  const string conf2 = "gserver/tests/sequence_nest_layer_group.conf";
  const string dir2 = "gserver/tests/t2";
  CalCost(conf2, dir2, cost2, num_passes);
@ -109,6 +111,17 @@ TEST(RecurrentGradientMachine, HasSubSequence) {
  delete[] cost2;
 }
 // Runs the shared test() helper on the layer-group config pair: the plain
 // sequence config and its nested-sequence counterpart.
 TEST(RecurrentGradientMachine, HasSubSequence) {
  test("gserver/tests/sequence_layer_group.conf",
       "gserver/tests/sequence_nest_layer_group.conf");
 }
 // Runs the shared test() helper on the RNN config pair: the plain sequence
 // RNN config and its nested (hierarchical) counterpart.
 TEST(RecurrentGradientMachine, rnn) {
  test("gserver/tests/sequence_rnn.conf",
       "gserver/tests/sequence_nest_rnn.conf");
 }
 int main(int argc, char** argv) {
  if (paddle::version::isWithPyDataProvider()) {
    if (!paddle::version::isWithGpu()) {
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@ -255,6 +255,15 @@ struct Argument {
  /*
   Get Sequence Length, startPositions and max Length according to input
   1. For sequence data:
      Each tuple is (seq_length, seq_start, seq_id, seq_id)
      The tuples are sorted according to seq_length or subseq_length
      *maxSequenceLength is the maximal sequence length
   2. For subsequence data:
      Each tuple is (subseq_length, subseq_start, seq_id, subseq_id)
      The tuples are not sorted. They are in the original order.
      *maxSequenceLength is the maximal number of subsequences in each sequence.
   */
  void getSeqLengthAndStart(
      std::vector<std::tuple<int, int, int, int>>* seqLengthAndStart,