Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into docker_new
commit 7606680dc2
@@ -0,0 +1,61 @@
import numpy
import paddle.v2 as paddle

import mnist_util


def train_reader():
    train_file = './data/raw_data/train'
    generator = mnist_util.read_from_mnist(train_file)
    for item in generator:
        yield item


def main():
    paddle.init(use_gpu=False, trainer_count=1)

    # define network topology
    images = paddle.layer.data(
        name='pixel', type=paddle.data_type.dense_vector(784))
    label = paddle.layer.data(
        name='label', type=paddle.data_type.integer_value(10))
    hidden1 = paddle.layer.fc(input=images, size=200)
    hidden2 = paddle.layer.fc(input=hidden1, size=200)
    inference = paddle.layer.fc(input=hidden2,
                                size=10,
                                act=paddle.activation.Softmax())
    cost = paddle.layer.classification_cost(input=inference, label=label)

    parameters = paddle.parameters.create(cost)
    for param_name in parameters.keys():
        array = parameters.get(param_name)
        array[:] = numpy.random.uniform(low=-1.0, high=1.0, size=array.shape)
        parameters.set(parameter_name=param_name, value=array)

    adam_optimizer = paddle.optimizer.Adam(learning_rate=0.01)

    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            para = parameters.get('___fc_2__.w0')
            print "Pass %d, Batch %d, Cost %f, Weight Mean Of Fc 2 is %f" % (
                event.pass_id, event.batch_id, event.cost, para.mean())
        else:
            pass

    trainer = paddle.trainer.SGD(update_equation=adam_optimizer)

    trainer.train(train_data_reader=train_reader,
                  topology=cost,
                  parameters=parameters,
                  event_handler=event_handler,
                  batch_size=32,  # batch size should be refactored into the data reader
                  data_types={  # data_types will be removed; it should be part of
                                # the network topology
                      'pixel': images.type,
                      'label': label.type
                  })


if __name__ == '__main__':
    main()
File diff suppressed because it is too large
@@ -0,0 +1,161 @@
# Python Data Reader Design Doc

At training and testing time, PaddlePaddle programs need to read data. To ease the users' work of writing data reading code, we define that

- A *reader* is a function that reads data (from a file, network, random number generator, etc.) and yields data items.
- A *reader creator* is a function that returns a reader function.
- A *reader decorator* is a function that accepts one or more readers and returns a reader.

and provide frequently used reader creators and reader decorators.

## Data Reader Interface

Indeed, a *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameters that creates an iterable (anything that can be used in `for x in iterable`):

```
iterable = data_reader()
```

Each element produced by the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item or a tuple of items. Items should be of a [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int).

An example implementation of a single item data reader creator:

```python
def reader_creator_random_image(width, height):
    def reader():
        while True:
            yield numpy.random.uniform(-1, 1, size=width*height)
    return reader
```

An example implementation of a multiple item data reader creator:
```python
def reader_creator_random_image_and_label(width, height, label):
    def reader():
        while True:
            yield numpy.random.uniform(-1, 1, size=width*height), label
    return reader
```

## Usage

The data reader, the mapping from item(s) read to data layer(s), the batch size, and the total number of passes will be passed into `paddle.train`:

```python
# two data layers are created:
image_layer = paddle.layer.data("image", ...)
label_layer = paddle.layer.data("label", ...)

# ...

paddle.train(paddle.dataset.mnist, {"image":0, "label":1}, 128, 10, ...)
```

## Data Reader Decorator

A *data reader decorator* takes one or more data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use the `@` syntax.

Since we have a strict interface for data readers (no parameters, return a single data entry), data readers can be combined flexibly via data reader decorators. The following are a few examples:

### Prefetch Data

Since reading data may take time and training cannot proceed without data, it is generally a good idea to prefetch data.

Use `paddle.reader.buffered` to prefetch data:

```python
buffered_reader = paddle.reader.buffered(paddle.dataset.mnist, 100)
```

`buffered_reader` will try to buffer (prefetch) `100` data entries.
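
To make the decorator idea concrete, below is a minimal sketch of how such a buffering decorator *could* be written. It is an illustration only (not the actual `paddle.reader.buffered` implementation) and follows the Python 2 style used elsewhere in this doc:

```python
import threading
from Queue import Queue  # Python 2, matching the other examples in this doc


def buffered(data_reader, size):
    # Sketch only: a background thread keeps pulling entries from the
    # underlying reader into a bounded queue, so the consumer rarely waits.
    end_token = object()

    def reader():
        q = Queue(maxsize=size)

        def fill():
            for entry in data_reader():
                q.put(entry)
            q.put(end_token)  # signal that the underlying reader is exhausted

        t = threading.Thread(target=fill)
        t.daemon = True
        t.start()

        entry = q.get()
        while entry is not end_token:
            yield entry
            entry = q.get()

    return reader
```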

### Compose Multiple Data Readers

For example, we may want to use a source of real images (reusing the mnist dataset) and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).

We can do:

```python
def reader_creator_random_image(width, height):
    def reader():
        while True:
            yield numpy.random.uniform(-1, 1, size=width*height)
    return reader

def reader_creator_bool(t):
    def reader():
        while True:
            yield t
    return reader

true_reader = reader_creator_bool(True)
false_reader = reader_creator_bool(False)

reader = paddle.reader.compose(paddle.dataset.mnist, reader_creator_random_image(20, 20), true_reader, false_reader)
# Index 1 is skipped because paddle.dataset.mnist produces two items per data entry,
# and we don't care about the second item at this time.
paddle.train(reader, {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
```
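
The composition above can be pictured with a small sketch of what a compose-style decorator might do internally (an illustration under assumptions, not the actual `paddle.reader.compose`): draw one entry from every reader in lockstep and flatten the results into one tuple, which is why `paddle.dataset.mnist` occupies indices 0 and 1 in the mapping.

```python
def compose(*data_readers):
    # Sketch only: advance all readers together; flatten each reader's entry
    # (a single item or a tuple of items) into one combined tuple.
    def reader():
        iterators = [r() for r in data_readers]
        while True:
            out = []
            for it in iterators:
                try:
                    entry = next(it)
                except StopIteration:
                    return  # stop as soon as any reader is exhausted
                out.extend(entry if isinstance(entry, tuple) else (entry,))
            yield tuple(out)
    return reader
```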

### Shuffle

Given a shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read.

Example:
```python
reader = paddle.reader.shuffle(paddle.dataset.mnist, 512)
```
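
A buffered shuffle like this can be sketched as follows (an illustration only, not necessarily the actual `paddle.reader.shuffle` implementation): fill a buffer with `n` entries, shuffle it, drain it, and repeat until the underlying reader is exhausted.

```python
import random


def shuffle(data_reader, buf_size):
    # Sketch only: shuffling happens within a sliding buffer of buf_size
    # entries, approximating a full shuffle without loading the whole dataset.
    def reader():
        buf = []
        for entry in data_reader():
            buf.append(entry)
            if len(buf) >= buf_size:
                random.shuffle(buf)
                for b in buf:
                    yield b
                buf = []
        # drain whatever is left when the underlying reader ends
        random.shuffle(buf)
        for b in buf:
            yield b
    return reader
```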

## Q & A

### Why return only a single entry, but not a mini batch?

If a mini batch were returned, the data reader would need to take care of batch size. But batch size is a training concept, so it makes more sense for the user to specify it as a parameter of `train`.

Practically, always returning a single entry makes reusing existing data readers much easier (e.g., if an existing reader returned 3 entries instead of a single entry, the training code would be more complex because it would need to handle cases like a batch size of 2).

### Why use a dictionary but not a list to provide mapping?

We decided to use a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) because the user can easily reuse an item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip an item (e.g., using `{"image_a":0, "label":2}`).
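
For instance (a hypothetical snippet reusing the `paddle.train` signature from the Usage section), the same column can feed two data layers, or a column can be skipped, purely by changing the mapping dictionary:

```python
# column 0 feeds both "image_a" and "image_b"; every column is used
paddle.train(reader, {"image_a": 0, "image_b": 0, "label": 1}, 128, 10, ...)

# column 1 is skipped entirely
paddle.train(reader, {"image_a": 0, "label": 2}, 128, 10, ...)
```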

### How to create a custom data reader creator

```python
def image_reader_creator(image_path, label_path, n):
    def reader():
        f = open(image_path)
        l = open(label_path)
        images = numpy.fromfile(
            f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
        images = images / 255.0 * 2.0 - 1.0
        labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
        for i in xrange(n):
            yield images[i, :], labels[i]  # a single entry of data is created each time
        f.close()
        l.close()
    return reader

# image_reader_creator creates a reader
reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024)
paddle.train(reader, {"image":0, "label":1}, ...)
```

### How is `paddle.train` implemented

An example implementation of `paddle.train` could be:

```python
def make_minibatch(reader, minibatch_size):
    def ret():
        r = reader()
        buf = [r.next() for x in xrange(minibatch_size)]
        while len(buf) > 0:
            yield buf
            buf = [r.next() for x in xrange(minibatch_size)]
    return ret

def train(reader, mapping, batch_size, total_pass):
    for pass_idx in range(total_pass):
        for mini_batch in make_minibatch(reader, batch_size)():  # this loop will never end in online learning.
            do_forward_backward(mini_batch, mapping)
```
@@ -0,0 +1,240 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CosSimOp.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/Vector.h"

namespace paddle {
/**
 * Cosine Similarity for CpuMatrix
 *
 * \param out_mat, output value, size: nSamples * 1.
 * \param in1_mat, input value 1, size: nSamples * dim.
 * \param in2_mat, input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
 * \param scale, default 1.0
 *
 */
template <>
void CosSimForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
                                    const CpuMatrix& in1_mat,
                                    const CpuMatrix& in2_mat,
                                    real scale) {
  CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData());
  size_t num_samples = out_mat.getHeight();
  size_t dim = in1_mat.getWidth();
  /// column vector [nSamples, 1]
  real* out = out_mat.getData();
  const real* x = in1_mat.getData();
  const real* y = in2_mat.getData();

  /// in2 might only have one row or full rows
  CHECK(in2_mat.getHeight() == 1LU || in2_mat.getHeight() == num_samples);
  size_t inc = (in2_mat.getHeight() == 1LU) ? 0 : dim;
  for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) {
    real square_sum_x = 0;
    real square_sum_y = 0;
    real xy = 0;
    for (size_t j = 0; j < dim; ++j) {
      square_sum_x += x[j] * x[j];
      square_sum_y += y[j] * y[j];
      xy += x[j] * y[j];
    }
    CHECK(square_sum_x > 0 && square_sum_y > 0);
    out[i] = scale * xy / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
  }
}

/**
 * Cosine Similarity
 * for each row i,
 *   out[i] = scale * cos(input1[i], input2[i])
 *          = scale * <input1[i], input2[i]> / sqrt(|input1[i]|^2 * |input2[i]|^2)
 * when input2 only has one row, then for each row i,
 *   out[i] = cos(input1[i], input2[0])
 *
 * \param inputs[0] input matrix 1, size: nSamples * dim.
 * \param inputs[1] input matrix 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
 * \param outputs[0] output matrix, size: nSamples * 1.
 */

template <DeviceType Device>
class CosSimForwardFunc : public FunctionBase {
  void init(const FuncConfig& config) override {
    scale_ = config.get<real>("scale");
  }

  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(inputs.size(), 2UL);
    CHECK_EQ(outputs.size(), 1UL);

    CHECK_EQ(inputs[0].shape().ndims(), 2UL);
    CHECK_EQ(inputs[1].shape().ndims(), 2UL);
    CHECK_EQ(outputs[0].shape().ndims(), 2UL);

    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
    CHECK_EQ(outputs[0].shape()[1], 1UL);

    CHECK(outputs[0].data() && inputs[0].data() && inputs[1].data());

    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
    auto out_mat = outputs[0].matrix<Device>();
    const auto in1_mat = inputs[0].matrix<Device>();
    const auto in2_mat = inputs[1].matrix<Device>();

    CosSimForward<Device>(out_mat, in1_mat, in2_mat, scale_);
  }

private:
  real scale_;
};

/**
 * Cosine Similarity Derivative for CpuMatrix
 *
 * \param in1_grad  forward input grad 1, size: nSamples * dim.
 * \param in2_grad  forward input grad 2,
 *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
 *
 * \param out_grad  backward loss output grad, size: nSamples * 1.
 * \param out_val   forward output value, size: nSamples * 1.
 * \param in1_val   forward input value 1, size: nSamples * dim.
 * \param in2_val   forward input value 2,
 *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
 * \param scale, default 1.0
 */
template <>
void CosSimBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad,
                                     const CpuMatrix& out_val,
                                     const CpuMatrix& in1_val,
                                     const CpuMatrix& in2_val,
                                     CpuMatrix& in1_grad,
                                     CpuMatrix& in2_grad,
                                     real scale) {
  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
  CHECK_EQ(out_val.useGpu_, false) << "Matrix type are GPU, CPU required";

  const real* grad = out_grad.getData();
  const real* out = out_val.getData();
  const real* prev_out_x = in1_val.getData();
  const real* prev_out_y = in2_val.getData();
  real* prev_grad_x = in1_grad.getData();
  real* prev_grad_y = in2_grad.getData();

  size_t num_samples = out_grad.getHeight();
  size_t dim = in1_val.getWidth();
  CHECK_EQ(in2_val.getHeight(), in2_grad.getHeight());
  CHECK(in2_val.getHeight() == 1LU || in2_val.getHeight() == num_samples);
  size_t inc = (in2_val.getHeight() == 1LU) ? 0 : dim;
  for (size_t i = 0; i < num_samples; ++i,
              prev_out_x += dim,
              prev_out_y += inc,
              prev_grad_x += dim,
              prev_grad_y += inc) {
    real square_sum_x = 0;
    real square_sum_y = 0;
    real xy = 0;
    for (size_t j = 0; j < dim; ++j) {
      square_sum_x += prev_out_x[j] * prev_out_x[j];
      square_sum_y += prev_out_y[j] * prev_out_y[j];
      xy += prev_out_x[j] * prev_out_y[j];
    }
    CHECK(square_sum_x > 0 && square_sum_y > 0);
    if (xy == 0) {
      real reciprocal =
          1.0f / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
      for (size_t j = 0; j < dim; ++j) {
        prev_grad_x[j] += scale * grad[i] * prev_out_y[j] * reciprocal;
        prev_grad_y[j] += scale * grad[i] * prev_out_x[j] * reciprocal;
      }
    } else {
      real reciprocal_xy = 1.0f / xy;
      real reciprocal_square_sum_x = 1.0f / square_sum_x;
      real reciprocal_square_sum_y = 1.0f / square_sum_y;
      for (size_t j = 0; j < dim; ++j) {
        prev_grad_x[j] +=
            out[i] * grad[i] * (prev_out_y[j] * reciprocal_xy -
                                prev_out_x[j] * reciprocal_square_sum_x);
        prev_grad_y[j] +=
            out[i] * grad[i] * (prev_out_x[j] * reciprocal_xy -
                                prev_out_y[j] * reciprocal_square_sum_y);
      }
    }
  }
}

/**
 * Cosine Similarity backward Derivative
 *
 * \param outputs[0] forward input grad 1, size: nSamples * dim.
 * \param outputs[1] forward input grad 2,
 *                   size: n2 * dim (n2 == 1 or n2 == nSamples).
 *
 * \param inputs[0] backward loss output grad, size: nSamples * 1.
 * \param inputs[1] forward output value, size: nSamples * 1.
 * \param inputs[2] forward input value 1, size: nSamples * dim.
 * \param inputs[3] forward input value 2,
 *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
 */
template <DeviceType Device>
class CosSimBackwardFunc : public FunctionBase {
  void init(const FuncConfig& config) override {
    scale_ = config.get<real>("scale");
  }

  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(inputs.size(), 4UL);
    CHECK_EQ(outputs.size(), 2UL);
    /// dim of out_grad and out_val == 1, column vector
    CHECK_EQ(inputs[0].shape()[1], 1UL);
    CHECK_EQ(inputs[1].shape()[1], 1UL);
    /// nSamples of out_grad == out_val == in_val1 == in_grad1
    CHECK_EQ(inputs[1].shape()[0], inputs[0].shape()[0]);
    CHECK_EQ(inputs[2].shape()[0], inputs[0].shape()[0]);
    CHECK_EQ(outputs[0].shape()[0], inputs[0].shape()[0]);
    /// dim of in1_val == in2_val == in1_grad == in2_grad
    CHECK_EQ(inputs[3].shape()[1], inputs[2].shape()[1]);
    CHECK_EQ(outputs[0].shape()[1], inputs[2].shape()[1]);
    CHECK_EQ(outputs[1].shape()[1], inputs[2].shape()[1]);

    CHECK(inputs[0].data() && inputs[1].data() && inputs[2].data() &&
          inputs[3].data() && outputs[0].data() && outputs[1].data());

    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
    CHECK_EQ(outputs[1].getArgType(), ADD_TO);

    const auto out_grad = inputs[0].matrix<Device>();
    const auto out_val = inputs[1].matrix<Device>();
    const auto in1_val = inputs[2].matrix<Device>();
    const auto in2_val = inputs[3].matrix<Device>();
    auto in1_grad = outputs[0].matrix<Device>();
    auto in2_grad = outputs[1].matrix<Device>();

    CosSimBackward<Device>(
        out_grad, out_val, in1_val, in2_val, in1_grad, in2_grad, scale_);
  }

private:
  real scale_;
};

REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
#ifndef PADDLE_ONLY_CPU
REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
#endif
}  // namespace paddle
@@ -0,0 +1,61 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "Function.h"

namespace paddle {

/**
 * \brief Cosine Similarity Forward.
 *        for each row i,
 *          out[i] = scale * cos(in1[i], in2[i])
 *                 = scale * \sum_j (in1[i][j] * in2[i][j]) /
 *                           sqrt(\sum_j (in1[i][j]^2) * \sum_j (in2[i][j]^2))
 *
 * \param[out] output  output value.
 * \param[in]  input1  input value.
 * \param[in]  input2  input value.
 * \param[in]  scale   default 1.0.
 *
 */
template <DeviceType Device>
void CosSimForward(typename Tensor<real, Device>::Matrix& output,
                   const typename Tensor<real, Device>::Matrix& input1,
                   const typename Tensor<real, Device>::Matrix& input2,
                   real scale);

/**
 * \brief Cosine Similarity Backward for Derivative.
 *
 * \param[in]     out_grad   backward loss output grad.
 * \param[in]     out_value  forward output value.
 * \param[in]     in1_value  forward input value 1.
 * \param[in]     in2_value  forward input value 2.
 * \param[in,out] in1_grad   forward input grad 1.
 * \param[in,out] in2_grad   forward input grad 2.
 * \param[in]     scale      default 1.0.
 *
 */
template <DeviceType Device>
void CosSimBackward(const typename Tensor<real, Device>::Matrix& out_grad,
                    const typename Tensor<real, Device>::Matrix& out_value,
                    const typename Tensor<real, Device>::Matrix& in1_value,
                    const typename Tensor<real, Device>::Matrix& in2_value,
                    typename Tensor<real, Device>::Matrix& in1_grad,
                    typename Tensor<real, Device>::Matrix& in2_grad,
                    real scale);

}  // namespace paddle
@@ -0,0 +1,241 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_device_functions.cuh"
#include "CosSimOp.h"

namespace paddle {

template<int block_size>
__global__ void KeCosSim(real* output,
                         const real* input1,
                         const real* input2,
                         int width,
                         int input1_height,
                         int input2_height,
                         real scale) {
  const int ty = blockIdx.y;
  int tid = threadIdx.x;

  __shared__ real xx[block_size];
  __shared__ real yy[block_size];
  __shared__ real xy[block_size];

  xx[tid] = 0.0;
  yy[tid] = 0.0;
  xy[tid] = 0.0;
  __syncthreads();

  input1 += ty * width;
  if (input2_height > 1) {
    input2 += ty * width;
  }
  for (int index = tid; index < width; index += block_size) {
    real x = input1[index];
    real y = input2[index];
    xx[tid] += x * x;
    yy[tid] += y * y;
    xy[tid] += x * y;
  }
  __syncthreads();

  for (int s = block_size / 2; s > 0; s >>= 1) {
    if (tid < s) {
      xx[tid] += xx[tid + s];
      yy[tid] += yy[tid + s];
      xy[tid] += xy[tid + s];
    }
    __syncthreads();
  }
  if (tid == 0) {
    output[ty] = scale * xy[0] / (sqrt(xx[0]) * sqrt(yy[0]));
  }
}

void hlCossim(real* output,
              const real* input1,
              const real* input2,
              size_t width,
              size_t input1_height,
              size_t input2_height,
              real scale) {
  CHECK_NOTNULL(output);
  CHECK_NOTNULL(input1);
  CHECK_NOTNULL(input2);
  const int block_size = 256;
  dim3 threads(block_size, 1);
  dim3 grid(1, input1_height);

  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
      (output, input1, input2, width, input1_height, input2_height, scale);
  CHECK_SYNC("hlCossim failed");
}

template <>
void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
                                    const GpuMatrix& in1_mat,
                                    const GpuMatrix& in2_mat,
                                    real scale) {
  CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData());
  CHECK(in1_mat.useGpu_ == true && in2_mat.useGpu_ == true)
      << "Matrix type are not GPU";

  size_t num_samples = out_mat.getHeight();
  size_t dim = in1_mat.getWidth();
  real* out = out_mat.getData();
  const real* x = in1_mat.getData();
  const real* y = in2_mat.getData();
  hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
}

template<int block_size>
__global__ void KeCosSimDerivative(const real* grad,
                                   const real* output,
                                   const real* prev_out_x,
                                   const real* prev_out_y,
                                   real* prev_grad_x,
                                   real* prev_grad_y,
                                   size_t width,
                                   size_t input1_height,
                                   size_t input2_height,
                                   real scale) {
  const int ty = blockIdx.y;
  int tid = threadIdx.x;

  __shared__ real xx[block_size];
  __shared__ real yy[block_size];
  __shared__ real xy[block_size];

  xx[tid] = 0.0;
  yy[tid] = 0.0;
  xy[tid] = 0.0;
  __syncthreads();

  prev_out_x += ty * width;
  prev_grad_x += ty * width;
  if (input2_height > 1) {
    prev_out_y += ty * width;
    prev_grad_y += ty * width;
  }
  for (int index = tid; index < width; index += block_size) {
    real x = prev_out_x[index];
    real y = prev_out_y[index];
    xx[tid] += x * x;
    yy[tid] += y * y;
    xy[tid] += x * y;
  }
  __syncthreads();

  for (int s = block_size / 2; s > 0; s >>= 1) {
    if (tid < s) {
      xx[tid] += xx[tid + s];
      yy[tid] += yy[tid + s];
      xy[tid] += xy[tid + s];
    }
    __syncthreads();
  }
  if (xy[0] == 0) {
    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
    for (int index = tid; index < width; index += block_size) {
      prev_grad_x[index] +=
          scale * grad[ty] * prev_out_y[index] * reciprocal;
      if (input2_height > 1) {
        prev_grad_y[index] +=
            scale * grad[ty] * prev_out_x[index] * reciprocal;
      } else {
        paddle::paddleAtomicAdd(prev_grad_y + index,
            scale * grad[ty] * prev_out_x[index] * reciprocal);
      }
    }
  } else {
    real reciprocalXY = 1.0 / xy[0];
    real reciprocalSquareSumX = 1.0 / xx[0];
    real reciprocalSquareSumY = 1.0 / yy[0];
    for (int index = tid; index < width; index += block_size) {
      prev_grad_x[index] += output[ty] * grad[ty] *
          (prev_out_y[index] * reciprocalXY -
           prev_out_x[index] * reciprocalSquareSumX);
      if (input2_height > 1) {
        prev_grad_y[index] += output[ty] * grad[ty] *
            (prev_out_x[index] * reciprocalXY -
             prev_out_y[index] * reciprocalSquareSumY);
      } else {
        paddle::paddleAtomicAdd(prev_grad_y + index, output[ty] * grad[ty] *
            (prev_out_x[index] * reciprocalXY -
             prev_out_y[index] * reciprocalSquareSumY));
      }
    }
  }
}

void hlCossimDerivative(const real* grad,
                        const real* output,
                        const real* prev_out_x,
                        const real* prev_out_y,
                        real* prev_grad_x,
                        real* prev_grad_y,
                        size_t width,
                        size_t input1_height,
                        size_t input2_height,
                        real scale) {
  CHECK_NOTNULL(grad);
  CHECK_NOTNULL(output);
  CHECK_NOTNULL(prev_out_x);
  CHECK_NOTNULL(prev_out_y);
  CHECK_NOTNULL(prev_grad_x);
  CHECK_NOTNULL(prev_grad_y);
  const int block_size = 256;
  dim3 threads(block_size, 1);
  dim3 grid(1, input1_height);
  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
      (grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y, width,
       input1_height, input2_height, scale);
  CHECK_SYNC("hlCossimDerivate failed");
}

template <>
void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                     const GpuMatrix& out_val,
                                     const GpuMatrix& in1_val,
                                     const GpuMatrix& in2_val,
                                     GpuMatrix& in1_grad,
                                     GpuMatrix& in2_grad,
                                     real scale) {
  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_
        && in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
      << "Matrix types are not equally GPU";

  size_t dim = in1_val.getWidth();
  const real* grad = out_grad.getData();
  const real* out = out_val.getData();
  const real* prev_out_x = in1_val.getData();
  const real* prev_out_y = in2_val.getData();
  real* prev_grad_x = in1_grad.getData();
  real* prev_grad_y = in2_grad.getData();
  hlCossimDerivative(grad,
                     out,
                     prev_out_x,
                     prev_out_y,
                     prev_grad_x,
                     prev_grad_y,
                     dim,
                     in1_val.getHeight(),
                     in2_val.getHeight(),
                     scale);
}

}  // namespace paddle