commit
ef84ff8657
@ -0,0 +1,36 @@
|
|||||||
|
=====================
|
||||||
|
Data Reader Interface
|
||||||
|
=====================
|
||||||
|
|
||||||
|
|
||||||
|
DataTypes
|
||||||
|
=========
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.data_type
|
||||||
|
:members:
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
DataFeeder
|
||||||
|
==========
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.data_feeder
|
||||||
|
:members:
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
Reader
|
||||||
|
======
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.reader
|
||||||
|
:members:
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.reader.creator
|
||||||
|
:members:
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
minibatch
|
||||||
|
=========
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.minibatch
|
||||||
|
:members:
|
||||||
|
:noindex:
|
@ -0,0 +1,75 @@
|
|||||||
|
Dataset
|
||||||
|
=======
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.dataset
|
||||||
|
:members:
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
mnist
|
||||||
|
+++++
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.dataset.mnist
|
||||||
|
:members:
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
cifar
|
||||||
|
+++++
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.dataset.cifar
|
||||||
|
:members:
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
conll05
|
||||||
|
+++++++
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.dataset.conll05
|
||||||
|
:members: get_dict,get_embedding,test
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
imdb
|
||||||
|
++++
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.dataset.imdb
|
||||||
|
:members:
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
imikolov
|
||||||
|
++++++++
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.dataset.imikolov
|
||||||
|
:members:
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
movielens
|
||||||
|
+++++++++
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.dataset.movielens
|
||||||
|
:members:
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
.. autoclass:: paddle.v2.dataset.movielens.UserInfo
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
sentiment
|
||||||
|
+++++++++
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.dataset.sentiment
|
||||||
|
:members:
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
uci_housing
|
||||||
|
+++++++++++
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.dataset.uci_housing
|
||||||
|
:members:
|
||||||
|
:noindex:
|
||||||
|
|
||||||
|
wmt14
|
||||||
|
+++++
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.dataset.wmt14
|
||||||
|
:members:
|
||||||
|
:noindex:
|
@ -0,0 +1,5 @@
|
|||||||
|
Image Interface
|
||||||
|
===============
|
||||||
|
|
||||||
|
.. automodule:: paddle.v2.image
|
||||||
|
:members:
|
After Width: | Height: | Size: 61 KiB |
@ -0,0 +1,245 @@
|
|||||||
|
# Design: Sequence Decoder Generating LoDTensors
|
||||||
|
In tasks such as machine translation and image to text,
|
||||||
|
a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences.
|
||||||
|
|
||||||
|
This documentation describes how to implement the sequence decoder as an operator.
|
||||||
|
|
||||||
|
## Beam Search based Decoder
|
||||||
|
The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences,
|
||||||
|
it is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
|
||||||
|
|
||||||
|
In the old version of PaddlePaddle, a C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search,
|
||||||
|
due to the complexity, the implementation relies on a lot of special data structures,
|
||||||
|
which are quite trivial and hard for users to customize.
|
||||||
|
|
||||||
|
There are a lot of heuristic tricks in the sequence generation tasks,
|
||||||
|
so the flexibility of sequence decoder is very important to users.
|
||||||
|
|
||||||
|
During PaddlePaddle's refactoring work,
|
||||||
|
some new concepts are proposed, such as [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md), that can better support sequence usage,
|
||||||
|
and they can help to make the implementation of beam search based sequence decoder **more transparent and modular** .
|
||||||
|
|
||||||
|
For example, the RNN states, candidate IDs and probabilities of beam search can be represented as `LoDTensors`;
|
||||||
|
the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated.
|
||||||
|
|
||||||
|
## Changing LoD's absolute offset to relative offsets
|
||||||
|
The current `LoDTensor` is designed to store levels of variable-length sequences,
|
||||||
|
it stores several arrays of integers each represents a level.
|
||||||
|
|
||||||
|
The integers in each level represents the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
|
||||||
|
let's call this format the **absolute-offset LoD** for clarity.
|
||||||
|
|
||||||
|
The absolute-offset LoD can quickly retrieve any sequence but fails to represent empty sequences; for example, a two-level LoD is as follows
|
||||||
|
```python
|
||||||
|
[[0, 3, 9]
|
||||||
|
[0, 2, 3, 3, 3, 9]]
|
||||||
|
```
|
||||||
|
The first level tells that there are two sequences:
|
||||||
|
- the first's offset is `[0, 3)`
|
||||||
|
- the second's offset is `[3, 9)`
|
||||||
|
|
||||||
|
while on the second level, there are several empty sequences that both begin and end at `3`.
|
||||||
|
It is impossible to tell how many empty second-level sequences exist in the first-level sequences.
|
||||||
|
|
||||||
|
There are many scenarios that rely on empty sequence representation,
|
||||||
|
such as machine translation or image to text, one instance has no translations or the empty candidate set for a prefix.
|
||||||
|
|
||||||
|
So let's introduce another format of LoD,
|
||||||
|
it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD.
|
||||||
|
|
||||||
|
For example, to represent the same sequences of the above data
|
||||||
|
|
||||||
|
```python
|
||||||
|
[[0, 3, 6]
|
||||||
|
[0, 2, 3, 3, 3, 9]]
|
||||||
|
```
|
||||||
|
|
||||||
|
the first level represents that there are two sequences,
|
||||||
|
their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`.
|
||||||
|
|
||||||
|
The second level is the same as in the absolute-offset example because the lower level is a tensor.
|
||||||
|
It is easy to find out the second sequence in the first-level LoD has two empty sequences.
|
||||||
|
|
||||||
|
The following demos are based on relative-offset LoD.
|
||||||
|
|
||||||
|
## Usage in a simple machine translation model
|
||||||
|
Let's start from a simple machine translation model that is simplified from [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a simple blueprint of what a sequence decoder can do and how to use it.
|
||||||
|
|
||||||
|
The model has an encoder that learns the semantic vector from a sequence,
|
||||||
|
and a decoder which uses the sequence decoder to generate new sentences.
|
||||||
|
|
||||||
|
**Encoder**
|
||||||
|
```python
|
||||||
|
import paddle as pd
|
||||||
|
|
||||||
|
dict_size = 8000
|
||||||
|
source_dict_size = dict_size
|
||||||
|
target_dict_size = dict_size
|
||||||
|
word_vector_dim = 128
|
||||||
|
encoder_dim = 128
|
||||||
|
decoder_dim = 128
|
||||||
|
beam_size = 5
|
||||||
|
max_length = 120
|
||||||
|
|
||||||
|
# encoder
|
||||||
|
src_word_id = pd.data(
|
||||||
|
name='source_language_word',
|
||||||
|
type=pd.data.integer_value_sequence(source_dict_size))
|
||||||
|
src_embedding = pd.embedding(size=source_dict_size, size=word_vector_dim)
|
||||||
|
|
||||||
|
src_word_vec = pd.lookup(src_embedding, src_word_id)
|
||||||
|
|
||||||
|
encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim)
|
||||||
|
|
||||||
|
encoder_ctx = pd.last_seq(encoder_out_seq)
|
||||||
|
# encoder_ctx_proj is the learned semantic vector
|
||||||
|
encoder_ctx_proj = pd.fc(
|
||||||
|
encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Decoder**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def generate():
|
||||||
|
decoder = pd.while_loop()
|
||||||
|
with decoder.step():
|
||||||
|
decoder_mem = decoder.memory(init=encoder_ctx) # mark the memory
|
||||||
|
generated_ids = decoder.memory() # TODO init to batch_size <s>s
|
||||||
|
generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s
|
||||||
|
|
||||||
|
target_word = pd.lookup(trg_embedding, generated_ids)
|
||||||
|
# expand encoder_ctx's batch to fit target_word's lod
|
||||||
|
# for example
|
||||||
|
# decoder_mem.lod is
|
||||||
|
# [[0 1 3],
|
||||||
|
# [0 1 3 6]]
|
||||||
|
# its tensor content is [a1 a2 a3 a4 a5]
|
||||||
|
# which means there are 2 sentences to translate
|
||||||
|
# - the first sentence has 1 translation prefixes, the offsets are [0, 1)
|
||||||
|
# - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
|
||||||
|
# the target_word.lod is
|
||||||
|
# [[0, 1, 6]
|
||||||
|
# [0, 2, 4, 7, 9, 12]]
|
||||||
|
# which means 2 sentences to translate, each has 1 and 5 prefixes
|
||||||
|
# the first prefix has 2 candidates
|
||||||
|
# the following has 2, 3, 2, 3 candidates
|
||||||
|
# the encoder_ctx_expanded's content will be
|
||||||
|
# [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5]
|
||||||
|
encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
|
||||||
|
decoder_input = pd.fc(
|
||||||
|
act=pd.activation.Linear(),
|
||||||
|
input=[target_word, encoder_ctx_expanded],
|
||||||
|
size=3 * decoder_dim)
|
||||||
|
gru_out, cur_mem = pd.gru_step(
|
||||||
|
decoder_input, mem=decoder_mem, size=decoder_dim)
|
||||||
|
scores = pd.fc(
|
||||||
|
gru_out,
|
||||||
|
size=target_dict_size,
|
||||||
|
bias=None,
|
||||||
|
act=pd.activation.Softmax())
|
||||||
|
# K is an config
|
||||||
|
topk_scores, topk_ids = pd.top_k(scores, K)
|
||||||
|
topk_generated_scores = pd.add_scalar(topk_scores, generated_scores)
|
||||||
|
|
||||||
|
selected_ids, selected_generation_scores = decoder.beam_search(
|
||||||
|
topk_ids, topk_generated_scores)
|
||||||
|
|
||||||
|
# update the states
|
||||||
|
decoder_mem.update(cur_mem) # tells how to update state
|
||||||
|
generated_ids.update(selected_ids)
|
||||||
|
generated_scores.update(selected_generation_scores)
|
||||||
|
|
||||||
|
decoder.output(selected_ids)
|
||||||
|
decoder.output(selected_generation_scores)
|
||||||
|
|
||||||
|
translation_ids, translation_scores = decoder()
|
||||||
|
```
|
||||||
|
The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates,
|
||||||
|
returns the result of the beam search algorithm.
|
||||||
|
|
||||||
|
In this way, users can customize anything on the inputs or outputs of beam search; for example, there are three ways to prune some translation prefixes:
|
||||||
|
|
||||||
|
1. make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate.
|
||||||
|
2. remove some specific candidate in `selected_ids`
|
||||||
|
3. get the final `translation_ids`, remove the translation sequence in it.
|
||||||
|
|
||||||
|
The implementation of sequence decoder can reuse the C++ class [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
|
||||||
|
so the python syntax is quite similar to a [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
|
||||||
|
|
||||||
|
Both of them are two-level `LoDTensors`
|
||||||
|
|
||||||
|
- the first level represents `batch_size` of (source) sentences;
|
||||||
|
- the second level represents the candidate ID sets for translation prefix.
|
||||||
|
|
||||||
|
For example, with 3 source sentences to translate, the sentences may have 2, 3, and 1 candidates respectively.
|
||||||
|
|
||||||
|
Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape,
|
||||||
|
a `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
|
||||||
|
|
||||||
|
For example, the previous state
|
||||||
|
|
||||||
|
* LoD is `[0, 1, 3][0, 2, 5, 6]`
|
||||||
|
* content of tensor is `a1 a2 b1 b2 b3 c1`
|
||||||
|
|
||||||
|
the current state stored in `encoder_ctx_expanded`
|
||||||
|
|
||||||
|
* LoD is `[0, 2, 7][0 3 5 8 9 11 11]`
|
||||||
|
* the content is
|
||||||
|
- a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates)
|
||||||
|
- a2 a2
|
||||||
|
- b1 b1 b1
|
||||||
|
- b2
|
||||||
|
- b3 b3
|
||||||
|
- None (c1 has 0 candidates, so c1 is dropped)
|
||||||
|
|
||||||
|
Benefiting from the relative-offset LoD, an empty candidate set can be represented naturally.
|
||||||
|
|
||||||
|
the status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor, the corresponding syntax is
|
||||||
|
|
||||||
|
```python
|
||||||
|
decoder.output(selected_ids)
|
||||||
|
decoder.output(selected_generation_scores)
|
||||||
|
```
|
||||||
|
|
||||||
|
the `selected_ids` is the candidate ids for the prefixes,
|
||||||
|
it will be `Packed` by `TensorArray` to a two-level `LoDTensor`,
|
||||||
|
the first level represents the source sequences,
|
||||||
|
the second level represents generated sequences.
|
||||||
|
|
||||||
|
Pack the `selected_scores` will get a `LoDTensor` that stores scores of each candidate of translations.
|
||||||
|
|
||||||
|
Pack the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.
|
||||||
|
|
||||||
|
## LoD and shape changes during decoding
|
||||||
|
<p align="center">
|
||||||
|
<img src="./images/LOD-and-shape-changes-during-decoding.jpg"/>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
According to the image above, the only phase that changes the LoD is beam search.
|
||||||
|
|
||||||
|
## Beam search design
|
||||||
|
The beam search algorithm will be implemented as one method of the sequence decoder, it has 3 inputs
|
||||||
|
|
||||||
|
1. `topk_ids`, top K candidate ids for each prefix.
|
||||||
|
2. `topk_scores`, the corresponding scores for `topk_ids`
|
||||||
|
3. `generated_scores`, the score of the prefixes.
|
||||||
|
|
||||||
|
All of them are LoDTensors, so that the sequence affiliation is clear.
|
||||||
|
Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
|
||||||
|
|
||||||
|
It will return three variables
|
||||||
|
|
||||||
|
1. `selected_ids`, the final candidate beam search function selected for the next step.
|
||||||
|
2. `selected_scores`, the scores for the candidates.
|
||||||
|
3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
|
||||||
|
|
||||||
|
## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
|
||||||
|
The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors,
|
||||||
|
and they exist in each time step,
|
||||||
|
so it is natural to store them in arrays.
|
||||||
|
|
||||||
|
Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors,
|
||||||
|
the results of beam search are better to store in a `TensorArray`.
|
||||||
|
|
||||||
|
The `Pack` and `UnPack` in `TensorArray` are used to package tensors in the array to a `LoDTensor` or split the `LoDTensor` to an array of tensors.
|
||||||
|
It needs some extensions to support pack or unpack an array of `LoDTensors`.
|
@ -0,0 +1,155 @@
|
|||||||
|
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License. */
|
||||||
|
|
||||||
|
#include "ScaleSubRegionOp.h"
|
||||||
|
#include "paddle/function/TensorShape.h"
|
||||||
|
|
||||||
|
namespace paddle {
|
||||||
|
|
||||||
|
template <>
|
||||||
|
void ScaleSubRegion<DEVICE_TYPE_CPU>(real* outputs,
|
||||||
|
const real* inputs,
|
||||||
|
const real* indices,
|
||||||
|
const TensorShape shape,
|
||||||
|
const FuncConfig& conf) {
|
||||||
|
real value = conf.get<real>("value");
|
||||||
|
|
||||||
|
int number = shape[0];
|
||||||
|
int channel = shape[1];
|
||||||
|
int height = shape[2];
|
||||||
|
int width = shape[3];
|
||||||
|
|
||||||
|
memcpy(outputs, inputs, number * channel * height * width * sizeof(real));
|
||||||
|
|
||||||
|
for (int n = 0; n < number; ++n) {
|
||||||
|
// indices start from 1
|
||||||
|
int offset = n * 6;
|
||||||
|
for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) {
|
||||||
|
for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) {
|
||||||
|
for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) {
|
||||||
|
int idx = ((n * channel + c) * height + h) * width + w;
|
||||||
|
outputs[idx] *= value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
void ScaleSubRegionGrad<DEVICE_TYPE_CPU>(const real* inGrad,
|
||||||
|
real* outGrad,
|
||||||
|
const real* indices,
|
||||||
|
const TensorShape shape,
|
||||||
|
const FuncConfig& conf) {
|
||||||
|
real value = conf.get<real>("value");
|
||||||
|
|
||||||
|
int number = shape[0];
|
||||||
|
int channel = shape[1];
|
||||||
|
int height = shape[2];
|
||||||
|
int width = shape[3];
|
||||||
|
|
||||||
|
for (int n = 0; n < number; ++n) {
|
||||||
|
for (int c = 0; c < channel; ++c) {
|
||||||
|
for (int h = 0; h < height; ++h) {
|
||||||
|
for (int w = 0; w < width; ++w) {
|
||||||
|
int idx = ((n * channel + c) * height + h) * width + w;
|
||||||
|
int offset = n * 6;
|
||||||
|
if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
|
||||||
|
h >= (indices[offset + 2] - 1) &&
|
||||||
|
h <= (indices[offset + 3] - 1) &&
|
||||||
|
w >= (indices[offset + 4] - 1) &&
|
||||||
|
w <= (indices[offset + 5] - 1)) {
|
||||||
|
outGrad[idx] += inGrad[idx] * value;
|
||||||
|
} else {
|
||||||
|
outGrad[idx] += inGrad[idx];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief For each instance, ScaleSubRegion can be used to multiply a value to
|
||||||
|
* a specified sub continuous region. By providing start index and end
|
||||||
|
* index for C/H/W, you can specify the location and shape of the region.
|
||||||
|
*
|
||||||
|
* Argument in this Function:
|
||||||
|
* \param inputs A 4-D tensor with shape [N, C, H, W], only one input.
|
||||||
|
* \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
|
||||||
|
* \param outputs A 4-D tensor with same shape as inputs, output value.
|
||||||
|
*/
|
||||||
|
template <DeviceType Device>
|
||||||
|
class ScaleSubRegionFunc : public FunctionBase {
|
||||||
|
public:
|
||||||
|
void init(const FuncConfig& config) override { conf_ = config; }
|
||||||
|
|
||||||
|
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
|
||||||
|
CHECK_EQ(2UL, inputs.size());
|
||||||
|
CHECK_EQ(1UL, outputs.size());
|
||||||
|
CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
|
||||||
|
|
||||||
|
TensorShape shape = inputs[0].shape();
|
||||||
|
|
||||||
|
ScaleSubRegion<Device>(outputs[0].data<real>(),
|
||||||
|
inputs[0].data<real>(),
|
||||||
|
inputs[1].data<real>(),
|
||||||
|
shape,
|
||||||
|
conf_);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
FuncConfig conf_;
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief The backward propagation of ScaleSubRegion Function.
|
||||||
|
*
|
||||||
|
* Argument in this Function:
|
||||||
|
* \param inputs A 4-D tensor with shape [N, C, H, W], output gradient.
|
||||||
|
* \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
|
||||||
|
* \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value.
|
||||||
|
*/
|
||||||
|
|
||||||
|
template <DeviceType Device>
|
||||||
|
class ScaleSubRegionGradFunc : public FunctionBase {
|
||||||
|
public:
|
||||||
|
void init(const FuncConfig& config) override { conf_ = config; }
|
||||||
|
|
||||||
|
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
|
||||||
|
CHECK_EQ(2UL, inputs.size());
|
||||||
|
CHECK_EQ(1UL, outputs.size());
|
||||||
|
CHECK_EQ(outputs[0].getArgType(), ADD_TO);
|
||||||
|
|
||||||
|
TensorShape shape = inputs[0].shape();
|
||||||
|
|
||||||
|
ScaleSubRegionGrad<Device>(inputs[0].data<real>(),
|
||||||
|
outputs[0].data<real>(),
|
||||||
|
inputs[1].data<real>(),
|
||||||
|
shape,
|
||||||
|
conf_);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
FuncConfig conf_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Function registration: one entry per (function name, device) pair, each
// naming the class template defined above that implements it.
REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc);
|
||||||
|
REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc);
|
||||||
|
// The GPU entries are only compiled in when PADDLE_WITH_CUDA is defined.
#ifdef PADDLE_WITH_CUDA
|
||||||
|
REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc);
|
||||||
|
REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
} // namespace paddle
|
@ -0,0 +1,55 @@
|
|||||||
|
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License. */
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "Function.h"
|
||||||
|
|
||||||
|
namespace paddle {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Function to multiply a value to values in specified sub continuous
|
||||||
|
* region. Indices must be provided to indicate the location and shape of
|
||||||
|
* the region and the multiplied value is passed by configure variable.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* \param[out] outputs Output value.
|
||||||
|
* \param[in] inputs Input data which contains NCHW information.
|
||||||
|
* \param[in] indices Indices data to indicate the sub region (six 1-based values per sample).
|
||||||
|
* \param[in] shape Tensor shape of input value.
|
||||||
|
* \param[in] conf Configure variable which contains the multiplied value.
|
||||||
|
*/
|
||||||
|
template <DeviceType Device>
|
||||||
|
void ScaleSubRegion(real* outputs,
|
||||||
|
const real* inputs,
|
||||||
|
const real* indices,
|
||||||
|
const TensorShape shape,
|
||||||
|
const FuncConfig& conf);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Backward propagation function of ScaleSubRegion.
|
||||||
|
*
|
||||||
|
* \param[in] inGrad Gradient of the output; read only.
|
||||||
|
* \param[out] outGrad Gradient of the input; results are accumulated into it.
|
||||||
|
* \param[in] indices Indices data.
|
||||||
|
* \param[in] shape The Shape of input tensor.
|
||||||
|
* \param[in] conf Configure variable.
|
||||||
|
*/
|
||||||
|
template <DeviceType Device>
|
||||||
|
void ScaleSubRegionGrad(const real* inGrad,
|
||||||
|
real* outGrad,
|
||||||
|
const real* indices,
|
||||||
|
const TensorShape shape,
|
||||||
|
const FuncConfig& conf);
|
||||||
|
} // namespace paddle
|
@ -0,0 +1,116 @@
|
|||||||
|
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License. */
|
||||||
|
|
||||||
|
#include "ScaleSubRegionOp.h"
|
||||||
|
#include "hl_base.h"
|
||||||
|
|
||||||
|
namespace paddle {
|
||||||
|
|
||||||
|
/**
 * Forward kernel: each thread with idx < nthreads writes one output element,
 * either the input scaled by value (inside the sample's sub region) or the
 * input unchanged (outside it).
 *
 * indices holds six values per sample, compared here as 1-based inclusive
 * bounds for channel, height and width.
 */
__global__ void KeScaleSubRegion(real* outputs,
                                 const real* inputs,
                                 const real* indices,
                                 real value,
                                 int channel,
                                 int height,
                                 int width,
                                 int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= nthreads) {
    return;
  }

  // Peel the N/C/H/W coordinates off the flat index, innermost axis first.
  int rest = idx;
  const int w = rest % width;
  rest /= width;
  const int h = rest % height;
  rest /= height;
  const int c = rest % channel;
  const int n = rest / channel;

  const real* region = indices + n * 6;
  const bool inside =
      c >= (region[0] - 1) && c <= (region[1] - 1) &&
      h >= (region[2] - 1) && h <= (region[3] - 1) &&
      w >= (region[4] - 1) && w <= (region[5] - 1);

  outputs[idx] = inside ? inputs[idx] * value : inputs[idx];
}
|
||||||
|
|
||||||
|
/**
 * GPU forward pass of ScaleSubRegion: reads the scale factor from conf under
 * the key "value" and launches KeScaleSubRegion with enough 1024-thread
 * blocks to cover every element of the [N, C, H, W] tensor.
 */
template <>
void ScaleSubRegion<DEVICE_TYPE_GPU>(real* outputs,
                                     const real* inputs,
                                     const real* indices,
                                     const TensorShape shape,
                                     const FuncConfig& conf) {
  const real scale = conf.get<real>("value");

  const int numSamples = shape[0];
  const int numChannels = shape[1];
  const int rows = shape[2];
  const int cols = shape[3];

  const size_t numElements = numSamples * numChannels * rows * cols;
  const int threadsPerBlock = 1024;
  // Round up so a partially filled last block is still launched.
  const int numBlocks = (numElements + threadsPerBlock - 1) / threadsPerBlock;

  KeScaleSubRegion<<<numBlocks, threadsPerBlock, 0, STREAM_DEFAULT>>>(
      outputs, inputs, indices, scale, numChannels, rows, cols, numElements);
  CHECK_SYNC("ScaleSubRegion");
}
|
||||||
|
|
||||||
|
/**
 * Backward kernel: each thread with idx < nthreads adds one element of inGrad
 * to outGrad, multiplied by value when the element lies inside the sample's
 * sub region.
 *
 * indices holds six values per sample, compared here as 1-based inclusive
 * bounds for channel, height and width.
 */
__global__ void KeScaleSubRegionDiff(const real* inGrad,
                                     real* outGrad,
                                     const real* indices,
                                     real value,
                                     int channel,
                                     int height,
                                     int width,
                                     int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= nthreads) {
    return;
  }

  // Peel the N/C/H/W coordinates off the flat index, innermost axis first.
  int rest = idx;
  const int w = rest % width;
  rest /= width;
  const int h = rest % height;
  rest /= height;
  const int c = rest % channel;
  const int n = rest / channel;

  const real* region = indices + n * 6;
  const bool inside =
      c >= (region[0] - 1) && c <= (region[1] - 1) &&
      h >= (region[2] - 1) && h <= (region[3] - 1) &&
      w >= (region[4] - 1) && w <= (region[5] - 1);

  if (inside) {
    outGrad[idx] += inGrad[idx] * value;
  } else {
    outGrad[idx] += inGrad[idx];
  }
}
|
||||||
|
|
||||||
|
/**
 * GPU backward pass of ScaleSubRegion: reads the scale factor from conf under
 * the key "value" and launches KeScaleSubRegionDiff with enough 1024-thread
 * blocks to cover every element of the [N, C, H, W] tensor.
 */
template <>
void ScaleSubRegionGrad<DEVICE_TYPE_GPU>(const real* inGrad,
                                         real* outGrad,
                                         const real* indices,
                                         const TensorShape shape,
                                         const FuncConfig& conf) {
  const real scale = conf.get<real>("value");

  const int numSamples = shape[0];
  const int numChannels = shape[1];
  const int rows = shape[2];
  const int cols = shape[3];

  const size_t numElements = numSamples * numChannels * rows * cols;
  const int threadsPerBlock = 1024;
  // Round up so a partially filled last block is still launched.
  const int numBlocks = (numElements + threadsPerBlock - 1) / threadsPerBlock;

  KeScaleSubRegionDiff<<<numBlocks, threadsPerBlock, 0, STREAM_DEFAULT>>>(
      inGrad, outGrad, indices, scale, numChannels, rows, cols, numElements);
  CHECK_SYNC("ScaleSubRegionGrad");
}
|
||||||
|
|
||||||
|
} // namespace paddle
|
@ -0,0 +1,72 @@
|
|||||||
|
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License. */
|
||||||
|
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
#include "FunctionTest.h"
|
||||||
|
|
||||||
|
namespace paddle {
|
||||||
|
|
||||||
|
// Compares the CPU and GPU implementations of ScaleSubRegion and its gradient
// over a grid of tensor sizes, scale values and region placements.
TEST(ScaleSubRegion, real) {
  for (size_t numSamples : {5, 32}) {
    for (size_t channels : {5, 32}) {
      for (size_t imgSizeH : {5, 33}) {
        for (size_t imgSizeW : {5, 32}) {
          for (real value : {-0.5, 0.0, 0.5}) {
            for (bool firstHalf : {false, true}) {
              VLOG(3) << " numSamples=" << numSamples
                      << " channels=" << channels << " imgSizeH=" << imgSizeH
                      << " imgSizeW=" << imgSizeW;

              for (bool testGrad : {false, true}) {
                const char* funcName =
                    testGrad ? "ScaleSubRegionGrad" : "ScaleSubRegion";
                // The gradient accumulates into its output; the forward pass
                // overwrites it.
                const auto outputMode = testGrad ? ADD_TO : ASSIGN_TO;

                CpuGpuFuncCompare compare(
                    funcName, FuncConfig().set<real>("value", value));

                TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
                TensorShape indicesShape{numSamples, 6};

                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, indicesShape));

                // Fill the indices input (argument 1) with either the first
                // half or the second half of every axis, 1-based.
                compare.registerInitCallback([=](BufferArg& arg, size_t index) {
                  if (index != 1) {
                    return;
                  }
                  real* data = (real*)arg.data();
                  const size_t dims[3] = {channels, imgSizeH, imgSizeW};

                  for (size_t i = 0; i < numSamples; ++i) {
                    real* region = data + i * 6;
                    for (size_t d = 0; d < 3; ++d) {
                      region[2 * d] = firstHalf ? 1 : dims[d] / 2;
                      region[2 * d + 1] = firstHalf ? dims[d] / 2 : dims[d];
                    }
                  }
                });

                compare.addOutputs(
                    BufferArg(VALUE_TYPE_FLOAT, shape, outputMode),
                    outputMode);
                compare.run();
              }
            }
          }
        }
      }
    }
  }
}
|
||||||
|
|
||||||
|
} // namespace paddle
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue