Merge remote-tracking branch 'upstream/develop' into factorization_machine_layer

release/0.11.0
wangmeng28 7 years ago
commit a30d53b792

doc/api/v2/config/networks.rst
@@ -125,3 +125,8 @@ simple_attention
 ..  automodule:: paddle.v2.networks
     :members: simple_attention
     :noindex:
+dot_product_attention
+---------------------
+..  automodule:: paddle.v2.networks
+    :members: dot_product_attention
+    :noindex:

@ -21,6 +21,10 @@ limitations under the License. */
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h" #include "paddle/utils/Stat.h"
#ifdef PADDLE_USE_MKLDNN
#include "paddle/gserver/layers/MKLDNNLayer.h"
#endif
#ifndef PADDLE_MOBILE_INFERENCE #ifndef PADDLE_MOBILE_INFERENCE
#include "MultiNetwork.h" #include "MultiNetwork.h"
#include "RecurrentGradientMachine.h" #include "RecurrentGradientMachine.h"
@ -300,6 +304,17 @@ void NeuralNetwork::backward(const UpdateCallback& callback) {
} }
} }
void NeuralNetwork::finish() {
#ifdef PADDLE_USE_MKLDNN
FOR_EACH_R(layer, layers_) {
MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(*layer);
if (dnnLayer) {
dnnLayer->convertWeightsToPaddle();
}
}
#endif
}
Argument NeuralNetwork::getLayerOutput(const std::string& layerName) { Argument NeuralNetwork::getLayerOutput(const std::string& layerName) {
return getLayer(layerName)->getOutput(); return getLayer(layerName)->getOutput();
} }

paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -134,6 +134,9 @@ public:
   const std::string& getName() const { return subModelName_; }
 
+  /// some finishing work, like converting the weight format of MKLDNNLayers
+  void finish() override;
+
 protected:
   /**
    * The constructor of NeuralNetwork.

paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -313,6 +313,7 @@ void MKLDNNConvLayer::resetOutValue(
     cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
     CHECK(cvtOutVal_) << "should not be empty";
   } else {
+    cpuOut->setData(output_.value->getData());
     cpuOutVal_ = out;
   }
   // when output is cpu device, change the mkldnn output value and make them
@@ -456,17 +457,18 @@ void MKLDNNConvLayer::resetOutGrad(
     MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
   } else {
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    // always share the same grad data as the CPU output,
+    // so the activation can get the right grad from output_.grad
+    output_.grad->setData(cpuOut->getData());
     // same PrimitiveDesc with cpuInVal_
     CHECK(cpuOutVal_);
     cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
     // create reorder if primitive desc does not match
     if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
-      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
+      out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
       cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
       CHECK(cvtOutGrad_);
     } else {
-      // share the same data of CPU output
-      output_.grad->setData(cpuOut->getData());
       out = cpuOutGrad_;
     }
   }

paddle/gserver/layers/MKLDNNLayer.h
@@ -46,6 +46,9 @@ protected:
   // backward also need reset after reset forward handle
   bool needResetBwd_;
 
+  // whether the output is consumed only by MKLDNN layers
+  bool outputOnlyMKLDNN_;
+
   // mkldnn engine, stream and primitives
   mkldnn::engine engine_;
   std::shared_ptr<MKLDNNStream> stream_;
@@ -141,6 +144,9 @@ public:
       updateInputData();
     }
 
+    if (!outputOnlyMKLDNN_) {
+      clearGrads();
+    }
     stream_->submit(pipelineFwd_);
   }
@@ -389,7 +395,8 @@ protected:
       CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
           << "Only support other device is CPU yet";
     }
-    return outputOtherDevice_.size() == 0;
+    outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0;
+    return outputOnlyMKLDNN_;
   }
@@ -398,6 +405,16 @@ protected:
   void setDevice(int id) { deviceId_ = id; }
 
 private:
+  /**
+   * Clear all grads.
+   */
+  void clearGrads() {
+    output_.grad->zeroMem();
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      outputOtherDevice_[i].grad->zeroMem();
+    }
+  }
+
   /**
    * Set deviceId of the params used in this layer.
    */

@ -146,6 +146,7 @@ void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_); cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
CHECK(cvtOutVal_) << "should not be emptry"; CHECK(cvtOutVal_) << "should not be emptry";
} else { } else {
cpuOut->setData(output_.value->getData());
cpuOutVal_ = out; cpuOutVal_ = out;
} }
output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_); output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
@ -213,15 +214,16 @@ void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc()); MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
} else { } else {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad; const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
// always share the same grad data of CPU output
// then the activation can get the right grad from output_.grad
output_.grad->setData(cpuOut->getData());
cpuOutGrad_ = MKLDNNMatrix::create( cpuOutGrad_ = MKLDNNMatrix::create(
cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_); cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) { if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc()); out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out); cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
CHECK(cvtOutGrad_) << "should not be emptry"; CHECK(cvtOutGrad_) << "should not be emptry";
} else { } else {
// share the same data of CPU output
output_.grad->setData(cpuOut->getData());
out = cpuOutGrad_; out = cpuOutGrad_;
} }
} }

paddle/gserver/tests/CMakeLists.txt
@@ -26,7 +26,10 @@ if(WITH_MKLDNN)
       test_MKLDNN.cpp
       MKLDNNTester.cpp
       LayerGradUtil.cpp)
-  add_test(NAME test_MKLDNN COMMAND test_MKLDNN)
+  add_test(NAME test_MKLDNN
+           COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python
+                   ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
+           WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()
 
 ################ test_CRFLayerGrad ####################

paddle/gserver/tests/MKLDNNTester.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "MKLDNNTester.h"
 #include "paddle/gserver/layers/MKLDNNBase.h"
 #include "paddle/gserver/layers/MKLDNNLayer.h"
+#include "paddle/trainer/Trainer.h"
 
 namespace paddle {
@@ -315,6 +316,7 @@ void MKLDNNTester::runOnce() {
     auto& value = para->getBuf(PARAMETER_VALUE);
     real lr = 1e-3;
     value->add(*grad, lr);
+    grad->zeroMem();
   };
   randomTopDiffs();
   dnnLayer_->backward(updateCallback);
@@ -411,4 +413,143 @@ void MKLDNNTester::run(const TestConfig& dnn,
   }
 }
 
+void MKLDNNTester::initArgument(DataIn& data,
+                                const std::string& configPath,
+                                const size_t iter) {
+  TrainerConfigHelper config(configPath);
+  size_t batchSize = config.getOptConfig().batch_size();
+  data.inArgs.resize(iter);
+  data.outGrads.resize(iter);
+  data.paraValues.clear();
+  for (const auto& layer_name : config.getModelConfig().input_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    for (size_t i = 0; i < iter; ++i) {
+      Argument arg;
+      arg.value = Matrix::create(batchSize, layerSize, false, false);
+      arg.grad = Matrix::create(batchSize, layerSize, false, false);
+      arg.value->randomizeUniform();
+      arg.value->add(-0.5);
+      arg.value->sigmoid(*arg.value);
+      arg.grad->zeroMem();
+      arg.ids = VectorT<int>::create(batchSize, false);
+      arg.ids->rand(layerSize);
+      generateSequenceStartPositions(batchSize, arg.sequenceStartPositions);
+      data.inArgs[i].push_back(arg);
+    }
+  }
+
+  for (const auto& layer_name : config.getModelConfig().output_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    for (size_t i = 0; i < iter; ++i) {
+      MatrixPtr grad = Matrix::create(batchSize, layerSize, false, false);
+      grad->randomizeUniform();
+      data.outGrads[i].push_back(grad);
+    }
+  }
+
+  for (const auto& para_config : config.getModelConfig().parameters()) {
+    VectorPtr value = Vector::create(para_config.size(), false);
+    value->randnorm(0, 2);
+    data.paraValues.push_back(value);
+  }
+}
+
+void MKLDNNTester::getOutResult(const std::string& configPath,
+                                DataIn& in,
+                                DataOut& out,
+                                bool use_mkldnn,
+                                size_t iter) {
+  FLAGS_use_gpu = false;
+  FLAGS_use_mkldnn = use_mkldnn;
+  *ThreadLocalRand::getSeed() = 1;
+  srand(1);
+
+  Trainer trainer;
+  auto config = std::make_shared<TrainerConfigHelper>(configPath);
+  trainer.init(config, false);
+  auto gradientMachine = trainer.getGradientMachine();
+  std::vector<ParameterPtr> parameters = gradientMachine->getParameters();
+  for (size_t i = 0; i < in.paraValues.size(); i++) {
+    parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]);
+  }
+  UpdateCallback simpleUpdate = [](Parameter* para) {
+    auto& grad = para->getBuf(PARAMETER_GRADIENT);
+    auto& value = para->getBuf(PARAMETER_VALUE);
+    real lr = 1e-2;
+    value->add(*grad, lr);
+    grad->zeroMem();
+  };
+
+  vector<Argument> outArgs;
+  gradientMachine->start();
+  out.outValues.clear();
+  out.paraValues.clear();
+  for (size_t i = 0; i < iter; ++i) {
+    VLOG(MKLDNN_TESTS) << "running iteration " << i;
+    gradientMachine->forward(in.inArgs[i], &outArgs, PASS_TRAIN);
+    // save forward result
+    for (size_t k = 0; k < outArgs.size(); k++) {
+      MatrixPtr value = Matrix::create(outArgs[k].value->getHeight(),
+                                       outArgs[k].value->getWidth(),
+                                       false,
+                                       false);
+      value->copyFrom(*outArgs[k].value);
+      out.outValues.push_back(value);
+    }
+
+    // random backward input
+    for (size_t k = 0; k < outArgs.size(); k++) {
+      outArgs[k].grad->copyFrom(*in.outGrads[i][k]);
+    }
+    gradientMachine->backward(simpleUpdate);
+  }
+  gradientMachine->finish();
+
+  // save param value
+  for (size_t i = 0; i < in.paraValues.size(); i++) {
+    VectorPtr val = Vector::create(
+        parameters[i]->getBuf(PARAMETER_VALUE)->getSize(), false);
+    val->copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE));
+    out.paraValues.push_back(val);
+  }
+}
+
+void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
+  CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
+  CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
+  for (size_t i = 0; i < ref.outValues.size(); i++) {
+    EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
+  }
+  for (size_t i = 0; i < ref.paraValues.size(); i++) {
+    EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
+  }
+}
+
+void MKLDNNTester::runBranchesTest(const std::string& configPath,
+                                   size_t iter,
+                                   float eps) {
+  DataIn in;
+  initArgument(in, configPath, iter);
+
+  DataOut outCpu, outDnn;
+  getOutResult(configPath, in, outCpu, false, iter);
+  getOutResult(configPath, in, outDnn, true, iter);
+
+  compareResult(outCpu, outDnn, eps);
+}
+
 }  // namespace paddle

paddle/gserver/tests/MKLDNNTester.h
@@ -33,6 +33,17 @@ class MKLDNNTester {
     NUM = 2,  // Number of total
   };
 
+  struct DataIn {
+    std::vector<std::vector<Argument>> inArgs;
+    std::vector<std::vector<MatrixPtr>> outGrads;
+    std::vector<VectorPtr> paraValues;
+  };
+
+  struct DataOut {
+    std::vector<MatrixPtr> outValues;
+    std::vector<VectorPtr> paraValues;
+  };
+
 protected:
   std::vector<TestConfig> configs_;
   vector<string> layerNames_;
@@ -74,7 +85,17 @@ public:
            float epsilon = 1e-4,
            bool log = false,
            int level = MKLDNN_ALL);
-  void setLogLevel(int lvl) { lvl_ = lvl; }
+  static void runBranchesTest(const std::string& configPath,
+                              size_t iter = 3,
+                              float eps = 1e-4);
+  static void initArgument(DataIn& data,
+                           const std::string& configPath,
+                           size_t iter = 3);
+  static void getOutResult(const std::string& configPath,
+                           DataIn& in,
+                           DataOut& out,
+                           bool use_mkldnn,
+                           size_t iter = 3);
 
 private:
   void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize);
@@ -101,8 +122,9 @@ private:
   void saveWgt(const vector<ParameterPtr>& from, vector<VectorPtr>& to);
   void restoreWgt(const vector<VectorPtr>& from, vector<ParameterPtr>& to);
 
-  double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2);
-  double compareVector(const VectorPtr& v1, const VectorPtr& v2);
+  static double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2);
+  static double compareVector(const VectorPtr& v1, const VectorPtr& v2);
+  static void compareResult(DataOut& ref, DataOut& dnn, float eps = 1e-4);
 
   /**
   * Get delta percent
@@ -111,7 +133,7 @@ private:
   * else return sum(abs(a-b)) / sum(abs(b))
   * The return value should be smaller than eps when passing.
   */
-  double getDelta(const real* d1,
+  static double getDelta(const real* d1,
                  const real* d2,
                  size_t len,
                  const float failRate = 1e-3,
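
Editor's note: getDelta's doc comment above defines the pass metric as a relative L1 difference. For intuition, a minimal numpy sketch of that formula (the failRate branch's body is not visible in this hunk, so it is omitted here):

    # A minimal sketch (not part of the diff) of the delta metric documented
    # above: sum(abs(a-b)) / sum(abs(b)).
    import numpy as np

    def relative_delta(tested, ref):
        # Relative L1 difference; the failRate handling in the real getDelta
        # is not shown in this hunk and is left out.
        return np.abs(tested - ref).sum() / np.abs(ref).sum()

    ref = np.random.rand(64).astype('float32')
    tested = ref + np.random.uniform(-1e-6, 1e-6, ref.shape).astype('float32')
    assert relative_delta(tested, ref) < 1e-4  # compareResult's default eps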

paddle/gserver/tests/mkldnn_branches_conv.conf (new file)
@@ -0,0 +1,56 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=16)
+channels = get_config_arg("channels", int, 2)
+
+
+def two_conv(input, group_name):
+    out1 = img_conv_layer(input=input,
+                          name=group_name + '_conv1',
+                          filter_size=1,
+                          num_filters=channels,
+                          padding=0,
+                          shared_biases=True,
+                          act=ReluActivation())
+
+    out2 = img_conv_layer(input=input,
+                          name=group_name + '_conv2',
+                          filter_size=3,
+                          num_filters=channels,
+                          padding=1,
+                          shared_biases=True,
+                          act=ReluActivation())
+    return out1, out2
+
+
+data = data_layer(name="input", size=channels * 16 * 16)
+
+conv = img_conv_layer(input=data,
+                      num_channels=channels,
+                      filter_size=3,
+                      num_filters=channels,
+                      padding=1,
+                      shared_biases=True,
+                      act=ReluActivation())
+
+a1, a2 = two_conv(input=conv, group_name='a')
+concat = concat_layer(input=[a1, a2])
+
+b1, b2 = two_conv(input=conv, group_name='b')
+addto = addto_layer(input=[b1, b2])
+
+outputs([concat, addto])
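
Editor's note: this config reads channels via get_config_arg, which is how the C++ test below overrides it through --config_args (FLAGS_config_args). A sketch of exercising the same override from Python, assuming a v2-era Paddle checkout where paddle.trainer.config_parser is importable:

    # Hypothetical standalone check (not part of the diff): parse the branch
    # config with the same "channels" override the C++ test supplies.
    from paddle.trainer.config_parser import parse_config

    conf = parse_config("paddle/gserver/tests/mkldnn_branches_conv.conf",
                        config_arg_str="channels=32")
    # Expect the shared conv plus the two two_conv branch groups.
    print([layer.name for layer in conf.model_config.layers])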

paddle/gserver/tests/test_MKLDNN.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+#include <paddle/utils/PythonUtil.h>
 #include <string>
 #include <vector>
 #include "MKLDNNTester.h"
@@ -40,12 +41,13 @@ DECLARE_bool(use_mkldnn);
 struct testFcDesc {
   int bs;
   int ic;
+  int oc;
   int ih, iw;  // oh == ow == 1
-  int oc;
 };
 
 static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) {
   cfg.layerConfig.set_type("mkldnn_fc");
+  cfg.layerConfig.set_active_type("relu");
   cfg.layerConfig.set_size(pm.oc);
   cfg.inputDefs.push_back(
       {INPUT_DATA,
@@ -86,6 +88,7 @@ struct testConvDesc {
 static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) {
   cfg.layerConfig.set_type("mkldnn_conv");
+  cfg.layerConfig.set_active_type("relu");
   cfg.layerConfig.set_num_filters(pm.oc);
   cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow);
   cfg.layerConfig.set_shared_biases(true);
@@ -158,6 +161,7 @@ struct testPoolDesc {
 static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) {
   cfg.layerConfig.set_type("mkldnn_pool");
+  cfg.layerConfig.set_active_type("relu");
   cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow);
   cfg.inputDefs.push_back(
       {INPUT_DATA,
@@ -244,13 +248,26 @@ TEST(MKLDNNActivation, Activations) {
   }
 }
 
-// TODO(TJ): add branch test
+DECLARE_string(config_args);
+TEST(MKLDNNLayer, branches) {
+  std::vector<std::string> cases = {"conv"};
+  for (auto name : cases) {
+    std::string config = "./gserver/tests/mkldnn_branches_" + name + ".conf";
+    for (auto channels : {2, 32}) {
+      std::ostringstream oss;
+      oss << "channels=" << channels;
+      FLAGS_config_args = oss.str();
+      MKLDNNTester::runBranchesTest(config);
+    }
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   FLAGS_use_gpu = false;
   FLAGS_use_mkldnn = true;
   initMain(argc, argv);
+  initPython(argc, argv);
   FLAGS_thread_local_rand_use_global_seed = true;
   srand(1);
   return RUN_ALL_TESTS();

python/paddle/trainer_config_helpers/networks.py
@@ -26,8 +26,9 @@ __all__ = [
     'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
     "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
     'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru',
-    'simple_attention', 'simple_gru2', 'bidirectional_gru', 'text_conv_pool',
-    'bidirectional_lstm', 'inputs', 'outputs'
+    'simple_attention', 'dot_product_attention', 'simple_gru2',
+    'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm', 'inputs',
+    'outputs'
 ]
@@ -1361,6 +1362,7 @@ def simple_attention(encoded_sequence,
                                  compute attention weight.
     :type transform_param_attr: ParameterAttribute
     :return: a context vector
+    :rtype: LayerOutput
     """
     assert encoded_proj.size == decoder_state.size
     proj_size = encoded_proj.size
@@ -1396,6 +1398,88 @@ def simple_attention(encoded_sequence,
         input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
 
 
+@wrap_name_default()
+def dot_product_attention(encoded_sequence,
+                          attended_sequence,
+                          transformed_state,
+                          softmax_param_attr=None,
+                          name=None):
+    """
+    Calculate and return a context vector with a dot-product attention
+    mechanism. The dimension of the context vector equals that of the
+    attended_sequence.
+
+    ..  math::
+
+        a(s_{i-1},h_{j}) & = s_{i-1}^\mathrm{T} h_{j}
+
+        e_{i,j} & = a(s_{i-1}, h_{j})
+
+        a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}}
+
+        c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}z_{j}
+
+    where :math:`h_{j}` is the jth element of encoded_sequence,
+    :math:`z_{j}` is the jth element of attended_sequence,
+    and :math:`s_{i-1}` is transformed_state.
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        context = dot_product_attention(encoded_sequence=enc_seq,
+                                        attended_sequence=att_seq,
+                                        transformed_state=state)
+
+    :param name: A prefix attached to the name of each layer that is defined
+                 inside dot_product_attention.
+    :type name: basestring
+    :param softmax_param_attr: The parameter attribute of the sequence softmax
+                               that is used to produce the attention weights.
+    :type softmax_param_attr: ParameterAttribute
+    :param encoded_sequence: The output hidden vectors of the encoder.
+    :type encoded_sequence: LayerOutput
+    :param attended_sequence: The sequence to be attended. The attention
+                              weights are computed from dot products between
+                              transformed_state and encoded_sequence, and the
+                              context vector is the weighted sum of
+                              attended_sequence.
+    :type attended_sequence: LayerOutput
+    :param transformed_state: The transformed hidden state of the decoder from
+                              the previous time step. Since a dot product is
+                              performed between it and encoded_sequence, their
+                              dimensions must be equal. For flexibility, any
+                              transformation of the decoder's hidden state is
+                              assumed to be done outside dot_product_attention,
+                              so users can pass either the original state or a
+                              transformed one.
+    :type transformed_state: LayerOutput
+    :return: The context vector.
+    :rtype: LayerOutput
+    """
+    assert transformed_state.size == encoded_sequence.size
+
+    expanded = expand_layer(
+        input=transformed_state,
+        expand_as=encoded_sequence,
+        name='%s_expand' % name)
+
+    m = linear_comb_layer(
+        weights=expanded,
+        vectors=encoded_sequence,
+        name='%s_dot-product' % name)
+
+    attention_weight = fc_layer(
+        input=m,
+        size=1,
+        act=SequenceSoftmaxActivation(),
+        param_attr=softmax_param_attr,
+        name="%s_softmax" % name,
+        bias_attr=False)
+
+    scaled = scaling_layer(
+        weight=attention_weight,
+        input=attended_sequence,
+        name='%s_scaling' % name)
+
+    return pooling_layer(
+        input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
+
+
 def inputs(layers, *args):
     """
     Declare the inputs of network. The order of input should be as same as
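
Editor's note: for orientation, here is one way dot_product_attention could be wired into a recurrent decoder step, following the pattern of the existing simple_attention demos. This is a sketch, not part of this change; every name below (decoder_step, decoder_rnn, decoder_size, the projection layer) is illustrative, and the memory boot layer is omitted.

    # Illustrative decoder step (for use inside a recurrent_group); the
    # decoder state is projected to the encoder dimension before the call,
    # as the docstring requires.
    def decoder_step(encoded_sequence, current_word_emb, decoder_size):
        decoder_mem = memory(name='decoder_rnn', size=decoder_size)
        # Project the previous decoder state to the encoder dimension so the
        # dot product with encoded_sequence is well-defined.
        transformed = fc_layer(input=decoder_mem,
                               size=encoded_sequence.size,
                               act=LinearActivation(),
                               bias_attr=False)
        context = dot_product_attention(encoded_sequence=encoded_sequence,
                                        attended_sequence=encoded_sequence,
                                        transformed_state=transformed)
        return gru_step_layer(name='decoder_rnn',
                              input=fc_layer(input=[context, current_word_emb],
                                             size=decoder_size * 3),
                              output_mem=decoder_mem,
                              size=decoder_size)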
