Fix bug and Python API.

dangqingqing 8 years ago
parent b783e08ea0
commit 18cd1f2558

@ -61,7 +61,7 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
size_t begin = starts[i];
size_t end = starts[i + 1];
size_t steps = end - begin;
for (size_t j = 0; j < contextLength; ++j) {
for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) {
MatrixPtr x =
(const_cast<CpuMatrix&>(in)).subMatrix(begin + j, steps - j);
MatrixPtr dy =
@ -81,7 +81,7 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
for (size_t j = 0; j < steps; ++j) {
MatrixPtr dx = inG.subMatrix(begin + j, 1);
for (size_t t = 0; t < contextLength; ++t) {
if ((int(j) - int(t)) >= 0) {
if (int(j - t) >= 0) {
MatrixPtr dy =
(const_cast<CpuMatrix&>(outG)).subMatrix(begin + j - t, 1);
MatrixPtr w = (const_cast<CpuMatrix&>(filter)).subMatrix(t, 1);
@ -94,8 +94,37 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
* \brief TODO(qingqing)
* \brief The row convolution is called lookahead convolution. It was first
* introduced in the Deep Speech 2 system [1]. A bidirectional RNN learns a
* representation for a sequence by performing a forward and a backward pass
* through the entire sequence. However, unlike unidirectional RNNs,
* bidirectional RNNs are challenging to deploy in an online and low-latency
* setting. The lookahead convolution incorporates information from future
* subsequences in a computationally efficient manner to improve unidirectional
* recurrent neural networks.
* The connection of row convolution is different from the 1D sequence
* convolution. Assume that the future context-length is k, that is to say,
* it can get the output at timestep t by using the input feature from t-th
* timestep to (t+k)-th timestep. Assume that the hidden dim of input
* activations is d, the activations r_t for the new layer at time-step t are:
* r(t,i) = \sum_{j=1}^{k+1} W(i,j) * h(t+j-1, i), for (1 <= i <= d)
* The weight shape is: (k + 1) x d
* Function Arguments:
* \param inputs[0] The input activations.
* \param inputs[1] The filter (or weight) and shape is (k+1) x d.
* \param outputs[0] The output activations.
* [1] Dario Amodei, et al. Deep Speech 2: End-to-End Speech Recognition in
* English and Mandarin.
template <DeviceType Device>
@ -128,10 +157,21 @@ public:
RowConv<Device>(outMat, inMat, wMat, seqId);
* \brief TODO(qingqing)
* \brief The backward of row convolution function. This function calculates
* the gradient w.r.t filter and the gradient w.r.t input activations(or data).
* Argument in this Function:
* \param inputs[0] The gradient w.r.t output activations.
* \param inputs[1] The input activations.
* \param inputs[2] The filter (or weight) and shape is (k+1) x d.
* \param outputs[0] The gradient w.r.t input activations.
* \param outputs[1] The gradient w.r.t filter.
* Abbreviation:
* w.r.t: with respect to.
template <DeviceType Device>
@ -140,12 +180,27 @@ public:
void init(const FuncConfig& config) override {}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
// check
CHECK_EQ(3UL, inputs.size());
CHECK_EQ(2UL, outputs.size());
CHECK_EQ(outputs[0].getArgType(), ADD_TO);
CHECK_EQ(outputs[1].getArgType(), ADD_TO);
CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() &&
<< "SequenceArg required here.";
const auto outGrad = dynamic_cast<const SequenceArg&>(inputs[0]);
const auto in = dynamic_cast<const SequenceArg&>(inputs[1]);
const auto w = inputs[2];
auto inGrad = dynamic_cast<const SequenceArg&>(outputs[0]);
auto wGrad = outputs[1];
CHECK_EQ(in.shape().ndims(), 2UL);
CHECK_EQ(outGrad.shape().ndims(), 2UL);
CHECK_EQ(in.shape()[1], outGrad.shape()[1]);
CHECK_EQ(in.shape()[0], outGrad.shape()[0]);
CHECK_EQ(wGrad.shape()[1], in.shape()[1]);
const auto outGMat = outGrad.matrix<Device>();
const auto inMat = in.matrix<Device>();
const auto wMat = w.matrix<Device>();
@ -157,37 +212,7 @@ public:
: typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
const auto seqId = in.getSequenceId().vector<int, Device>();
std::cout << "in:" << std::endl;
for (int i = 0; i < inMat.getHeight(); ++i) {
for (int j = 0; j < inMat.getWidth(); ++j) {
std::cout << outGMat.getElement(i, j) << " ";
std::cout << std::endl;
std::cout << "w:" << std::endl;
for (int i = 0; i < wMat.getHeight(); ++i) {
for (int j = 0; j < wMat.getWidth(); ++j) {
std::cout << wMat.getElement(i, j) << " ";
std::cout << std::endl;
std::cout << "w:" << std::endl;
for (int i = 0; i < seqId.getSize(); ++i) {
std::cout << seqId.getElement(i) << " ";
std::cout << std::endl;
RowConvGrad<Device>(outGMat, inMat, wMat, inGMat, wGMat, seqId);
std::cout << std::endl << "out:" << std::endl;
for (int i = 0; i < inGMat.getHeight(); ++i) {
for (int j = 0; j < inGMat.getWidth(); ++j) {
std::cout << inGMat.getElement(i, j) << " ";
std::cout << std::endl;

@ -19,7 +19,14 @@ limitations under the License. */
namespace paddle {
* \brief TODO(qingqing)
* \brief The forward of row convolution.
* \param[out] out The output data and shape is h x d. h is the sum of
* time steps of all samples in one mini-batch.
* \param[in] in The input data and shape is h x d.
* \param[in] filter The filter and shape is k x d. The lookahead step
* number plus one equals k.
* \param[in] seq The sequence start positions.
template <DeviceType DType>
@ -29,7 +36,14 @@ void RowConv(typename Tensor<real, DType>::Matrix& out,
const typename Tensor<int, DType>::Vector& seq);
* \brief TODO(qingqing)
* \brief The backward of row convolution.
* \param[in] outG The gradient w.r.t output data.
* \param[in] in The input data.
* \param[in] filter The filter.
* \param[out] inG The gradient w.r.t input data.
* \param[out] filterG The gradient w.r.t filter.
* \param[in] seq The sequence start positions.
template <DeviceType DType>

@ -96,11 +96,6 @@ void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
const size_t height = in.getHeight();
const size_t width = in.getWidth();
LOG(INFO) << numSeq;
LOG(INFO) << contextLength;
LOG(INFO) << height;
LOG(INFO) << width;
real* y = out.getData();
const real* x = in.getData();
const real* w = filter.getData();
@ -108,7 +103,6 @@ void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
dim3 dimBlock(32, 32);
dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
LOG(INFO) << dimGrid.x;
if (contextLength <= 32) {
KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
@ -131,12 +125,12 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
const int blky = blockDim.y;
const int gidx = blockIdx.x * blockDim.x;
__shared__ real sh_x[BLOCK_H][BLOCK_W];
__shared__ real sh_dy[BLOCK_H][BLOCK_W];
__shared__ real sh_x[BLOCK_W][BLOCK_H];
__shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1];
__shared__ real sh_dw[CONTEXT][BLOCK_W];
for (int t = tidy; t < context; t += blky) {
sh_dw[t][tidx] = 0.0;
if (tidy < context) {
sh_dw[tidy][tidx] = 0.0;
@ -144,21 +138,31 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
const int start = starts[i];
const int end = starts[i + 1];
const int steps = end - start;
for (int j = tidy; j < steps; j += BLOCK_H) {
const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H;
for (int j = tidy; j < size; j += BLOCK_H) {
int xoff = gidx + tidx;
int yoff = start + j;
// transpose
sh_x[tidx][tidy] = xoff < width && yoff < end ? x[yoff * width + xoff] : 0.0;
sh_dy[tidx][tidy] = xoff < width && yoff < end ? dy[yoff * width + xoff] : 0.0;
sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
if (tidy < (context - 1)) {
yoff = yoff - context + 1;
sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
for (int t = 0; t < context; t++) {
real val = tidx + t < blockDim.x ? sh_x[tidy][tidx + t] * sh_dy[tidy][tidx]: 0.0;
real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t];
// warp size and blockDim.x is 32.
for (int offset = 16; offset > 0; offset /= 2) {
val += __shfl_down(val, offset);
val += __shfl_down(val, 16);
val += __shfl_down(val, 8);
val += __shfl_down(val, 4);
val += __shfl_down(val, 2);
val += __shfl_down(val, 1);
if (tidx == 0) {
sh_dw[t][tidy] += val;
@ -167,7 +171,7 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
for (int t = tidy; t < context && (gidx + tidx) < width; t += blky) {
for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) {
dw[t * width + gidx + tidx] += sh_dw[t][tidx];
@ -188,21 +192,30 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
const int start = starts[i];
const int end = starts[i + 1];
const int steps = end - start;
for (int j = 0; j < steps; j += BLOCK_H) {
const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H;
for (int j = tidy; j < size; j += BLOCK_H) {
int xoff = gidx + tidx;
int yoff = start + j;
// transpose
sh_x[tidx][tidy] = xoff < width && yoff < end ? x[yoff * width + xoff] : 0.0;
sh_dy[tidx][tidy] = xoff < width && yoff < end ? dy[yoff * width + xoff] : 0.0;
sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
for (int t = 0; t < context; t++) {
real val = tidx + t < blockDim.x ? sh_x[tidy][tidx + t] * sh_dy[tidy][tidx]: 0.0;
sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0;
real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
// warp size and blockDim.x is 32.
for (int offset = 16; offset > 0; offset /= 2) {
val += __shfl_down(val, offset);
val += __shfl_down(val, 16);
val += __shfl_down(val, 8);
val += __shfl_down(val, 4);
val += __shfl_down(val, 2);
val += __shfl_down(val, 1);
if (tidx == 0 && (gidx + tidy) < width) {
dw[t*width + gidx + tidy] += val;
@ -293,34 +306,36 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
const real* dy = outG.getData();
const real* x = in.getData();
const real* w = filter.getData();
real* dx = inG.getData();
real* dw = filterG.getData();
const int* starts = seq.getData();
dim3 dimBlock(32, 32);
dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
if (contextLength <= 16) {
KeRowConvBwWeight<32, 32, 16>
<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
(dw, x, dy, starts, height, width, numSeq, contextLength);
} else {
KeRowConvBwWeight2<32, 32>
<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
(dw, x, dy, starts, height, width, numSeq, contextLength);
if (filterG) {
dim3 dimBlock(32, 32);
dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
real* dw = filterG.getData();
if (contextLength <= 16) {
KeRowConvBwWeight<32, 32, 16>
<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
(dw, x, dy, starts, height, width, numSeq, contextLength);
} else {
KeRowConvBwWeight2<32, 32>
<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
(dw, x, dy, starts, height, width, numSeq, contextLength);
dim3 dimBlock2(32, 32);
dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
if (contextLength <= 64) {
KeRowConvBwData<32, 64>
<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
(dx, w, dy, starts, height, width, numSeq, contextLength);
} else {
<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
(dx, w, dy, starts, height, width, numSeq, contextLength);
if (inG) {
real* dx = inG.getData();
dim3 dimBlock2(32, 32);
dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
if (contextLength <= 64) {
KeRowConvBwData<32, 64>
<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
(dx, w, dy, starts, height, width, numSeq, contextLength);
} else {
<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
(dx, w, dy, starts, height, width, numSeq, contextLength);

@ -47,23 +47,16 @@ void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
TEST(RowConv, real) {
// for (size_t numSamples : {17, 129}) {
// for (size_t dim : {16, 248}) {
// for (size_t context: {3, 7, 65}) {
LOG(INFO) << "===========";
// for (size_t numSamples : {17}) {
// for (size_t dim : {16}) {
// for (size_t context: {3}) {
size_t numSamples = 17;
size_t dim = 16;
size_t context = 3;
LOG(INFO) << " numSamples=" << numSamples << " dim=" << dim
<< " context length=" << context;
testRowConvFw(numSamples, dim, context);
// testRowConvBw(numSamples, dim, context);
// }
// }
// }
for (size_t numSamples : {17, 129, 2020}) {
for (size_t dim : {16, 512, 2560}) {
for (size_t context : {3, 19, 65}) {
VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
<< " context length=" << context;
testRowConvFw(numSamples, dim, context);
testRowConvBw(numSamples, dim, context);
} // namespace paddle

@ -75,7 +75,7 @@ void RowConvLayer::backward(const UpdateCallback& callback) {
BufferArgs outputs;
inputs.addArg(*getOutputGrad(), *startPos);
inputs.addArg(*getInputValue(0), *startPos);
inputs.addArg(*weight_->getW(), *startPos);
inputs.addArg(*weight_->getW(), wDims_);
MatrixPtr inGrad = getInputGrad(0);
MatrixPtr wGrad = weight_->getWGrad();

@ -37,9 +37,7 @@ protected:
// fan_out is the size of output feature.
std::unique_ptr<Weight> weight_;
// std::unique_ptr<Weight> biases_;
// how many steps to look ahead
// The step number to look ahead plus one equals contexLength_.
size_t contexLength_;
TensorShape wDims_;

@ -2081,6 +2081,23 @@ class MaxOutLayer(LayerBase):
g_layer_map[].width, out_channels)
class RowConvLayer(LayerBase):
def __init__(self, name, inputs, context_length, **xargs):
super(RowConvLayer, self).__init__(
name, 'maxout', 0, inputs=inputs, **xargs)
len(self.inputs) == 1,
'TransLayer must have one and only one input')
input_layer = self.get_input_layer(0)
row_conv_conf = self.config.inputs[0].row_conv_conf
row_conv_conf.context_length = context_length
psize = context_length * input_layer.size
dims = [context_length, input_layer.size]
self.create_input_parameter(0, psize, dims)
# key: cost type
# value: cost class
g_cost_map = {}

@ -120,6 +120,7 @@ __all__ = [
@ -187,6 +188,7 @@ class LayerType(object):
SPP_LAYER = "spp"
PAD_LAYER = "pad"
MULTIPLEX_LAYER = "multiplex"
ROW_CONV_LAYER = "row_conv"
PRINT_LAYER = "print"
PRIORBOX_LAYER = "priorbox"
@ -5528,3 +5530,77 @@ def multiplex_layer(input, name=None, layer_attr=None):
def row_conv_layer(input,
The row convolution is called lookahead convolution. It was first
introduced in the paper `Deep Speech 2: End-to-End Speech Recognition
in English and Mandarin <https://arxiv.org/abs/1512.02595>`_ .
A bidirectional RNN learns a representation for a sequence by
performing a forward and a backward pass through the entire sequence.
However, unlike unidirectional RNNs, bidirectional RNNs are challenging
to deploy in an online and low-latency setting. The lookahead convolution
incorporates information from future subsequences in a computationally
efficient manner to improve unidirectional recurrent neural networks.
The connection of row convolution is different from the 1D sequence
convolution. Assume that the future context-length is k, that is to say,
it can get the output at timestep t by using the input feature from t-th
timestep to (t+k)-th timestep. Assume that the hidden dim of input
activations is d, the activations r_t for the new layer at time-step t are:
.. math::
r_{t,i} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}}
\quad \text{for} \quad (1 \leq i \leq d)
The `context_len` is `k + 1`. That is to say, the lookahead step
number plus one equals context_len.
.. code-block:: python
row_conv = row_conv_layer(input=input_layer, context_len=3)
:param input: The input layer.
:type input: LayerOutput
:param context_len: The context length equals the lookahead step number
plus one.
:type context_len: int
:param act: Activation Type. Default is linear activation.
:type act: BaseActivation
:param param_attr: The Parameter Attribute. If None, the parameter will be
initialized smartly. It is better to set it yourself.
:type param_attr: ParameterAttribute
:param layer_attr: Extra Layer config.
:type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object.
:rtype: LayerOutput
assert isinstance(input, LayerOutput)
assert context_len > 0, "the context_len must be greatet than 0."
inputs=[Input(, **param_attr.attr)],
return LayerOutput(
name, LayerType.ROW_CONV_LAYER, input, activation=act, size=input.size)

@ -5,6 +5,6 @@ last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer)
test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer test_row_conv)
export whole_configs=(test_split_datasource)

@ -0,0 +1,41 @@
type: "nn"
layers {
name: "data"
type: "data"
size: 2560
active_type: ""
layers {
name: "__row_conv_layer_0__"
type: "maxout"
size: 2560
active_type: "relu"
inputs {
input_layer_name: "data"
input_parameter_name: "___row_conv_layer_0__.w0"
row_conv_conf {
context_length: 19
parameters {
name: "___row_conv_layer_0__.w0"
size: 48640
initial_mean: 0.0
initial_std: 0.229415733871
dims: 19
dims: 2560
initial_strategy: 0
initial_smart: true
input_layer_names: "data"
output_layer_names: "__row_conv_layer_0__"
sub_models {
name: "root"
layer_names: "data"
layer_names: "__row_conv_layer_0__"
input_layer_names: "data"
output_layer_names: "__row_conv_layer_0__"
is_recurrent_layer_group: false

@ -0,0 +1,9 @@
from paddle.trainer_config_helpers import *
settings(batch_size=1000, learning_rate=1e-5)
data = data_layer(name='data', size=2560)
row_conv = row_conv_layer(input=data, context_len=19, act=ReluActivation())