Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into clip_op

update-doc-pybind
wanghaoshuang 8 years ago
commit a3c3b7866e

1
.gitignore vendored

@ -27,3 +27,4 @@ CMakeFiles
cmake_install.cmake
paddle/.timestamp
python/paddlepaddle.egg-info/
paddle/pybind/pybind.h

File diff suppressed because it is too large Load Diff

@ -22,10 +22,10 @@ limitations under the License. */
*/
typedef enum {
HL_POOLING_MAX = 0,
// average includes padded values
HL_POOLING_AVERAGE = 1,
// average does not include padded values
HL_POOLING_AVERAGE_EXCLUDE_PADDING = 2,
HL_POOLING_AVERAGE = 1,
// average includes padded values
HL_POOLING_AVERAGE_INCLUDE_PADDING = 2,
HL_POOLING_END
} hl_pooling_mode_t;

@ -461,7 +461,7 @@ class add<float32x4_t> {
public:
INLINE float32x4_t operator()(const float32x4_t a,
const float32x4_t b) const {
return vmulq_f32(a, b);
return vaddq_f32(a, b);
}
};

@ -211,13 +211,11 @@ __global__ void KeAvgPoolForward(const int nthreads,
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int pool_size = (hend - hstart) * (wend - wstart);
int hend = min(hstart + sizeY, height);
int wend = min(wstart + sizeX, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, height);
wend = min(wend, width);
int pool_size = (hend - hstart) * (wend - wstart);
real aveval = 0;
inputData += (frameNum * channels + c) * height * width;
@ -299,12 +297,14 @@ __global__ void KeAvgPoolBackward(const int nthreads,
outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
for (int ph = phstart; ph < phend; ++ph) {
int hstart = ph * strideH - padH;
int hend = min(hstart + sizeY, height);
hstart = max(hstart, 0);
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int wend = min(wstart + sizeX, width);
wstart = max(wstart, 0);
int poolsize = (hend - hstart) * (wend - wstart);
gradient += outGrad[ph * pooledW + pw] / poolsize;
}
@ -600,16 +600,13 @@ __global__ void KeAvgPool3DForward(const int nthreads,
int dstart = pd * strideD - padD;
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int dend = min(dstart + sizeZ, depth + padD);
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
int dend = min(dstart + sizeZ, depth);
int hend = min(hstart + sizeY, height);
int wend = min(wstart + sizeX, width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dend = min(dend, depth);
hend = min(hend, height);
wend = min(wend, width);
int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
real aveval = 0;
inputData += (frameNum * channels + c) * depth * height * width;
@ -712,15 +709,18 @@ __global__ void KeAvgPool3DBackward(const int nthreads,
outGrad += (frameNum * channels + offsetC) * pooledD * pooledH * pooledW;
for (int pd = pdstart; pd < pdend; ++pd) {
int dstart = pd * strideD - padD;
int dend = min(dstart + sizeZ, depth);
dstart = max(dstart, 0);
for (int ph = phstart; ph < phend; ++ph) {
int hstart = ph * strideH - padH;
int hend = min(hstart + sizeY, height);
hstart = max(hstart, 0);
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int dstart = pd * strideD - padD;
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int dend = min(dstart + sizeZ, depth + padD);
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int wend = min(wstart + sizeX, width);
wstart = max(wstart, 0);
int poolsize = (dend - dstart) * (hend - hstart) * (wend - wstart);
gradient += outGrad[(pd * pooledH + ph) * pooledW + pw] / poolsize;
}

@ -432,11 +432,11 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
cudnn_mode = CUDNN_POOLING_MAX;
break;
case HL_POOLING_AVERAGE:
cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
break;
case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
break;
case HL_POOLING_AVERAGE_INCLUDE_PADDING:
cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
break;
default:
LOG(FATAL) << "parameter mode error";
}

@ -22,14 +22,14 @@ namespace framework {
template <>
Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
platform::CPUPlace, Eigen::DefaultDevice>() const {
return *device_context_->get_eigen_device<Eigen::DefaultDevice>();
return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
}
#ifndef PADDLE_ONLY_CPU
template <>
Eigen::GpuDevice&
ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
return *device_context_->get_eigen_device<Eigen::GpuDevice>();
return *device_context_.get_eigen_device<Eigen::GpuDevice>();
}
#endif

@ -366,7 +366,7 @@ struct EigenDeviceConverter<platform::GPUPlace> {
class ExecutionContext : public InferShapeContext {
public:
ExecutionContext(const OperatorBase& op, const Scope& scope,
const platform::DeviceContext* device_context)
const platform::DeviceContext& device_context)
: InferShapeContext(op, scope), device_context_(device_context) {}
template <typename PlaceType,
@ -374,9 +374,9 @@ class ExecutionContext : public InferShapeContext {
typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
DeviceType& GetEigenDevice() const;
platform::Place GetPlace() const { return device_context_->GetPlace(); }
platform::Place GetPlace() const { return device_context_.GetPlace(); }
const platform::DeviceContext* device_context() const {
const platform::DeviceContext& device_context() const {
return device_context_;
}
@ -401,7 +401,8 @@ class ExecutionContext : public InferShapeContext {
return res;
}
const platform::DeviceContext* device_context_;
private:
const platform::DeviceContext& device_context_;
};
template <>
@ -461,7 +462,7 @@ class OperatorWithKernel : public OperatorBase {
void Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const final {
auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
opKernel->Compute(ExecutionContext(*this, scope, &dev_ctx));
opKernel->Compute(ExecutionContext(*this, scope, dev_ctx));
}
static std::unordered_map<std::string /* op_type */, OpKernelMap>&

@ -29,9 +29,9 @@ bool CudnnPoolLayer::typeCheck(const std::string &poolType,
if (mode) {
*mode = HL_POOLING_AVERAGE;
}
} else if (poolType == "cudnn-avg-excl-pad-pool") {
} else if (poolType == "cudnn-avg-incl-pad-pool") {
if (mode) {
*mode = HL_POOLING_AVERAGE_EXCLUDE_PADDING;
*mode = HL_POOLING_AVERAGE_INCLUDE_PADDING;
}
} else {
return false;

@ -143,7 +143,7 @@ void DetectionOutputLayer::forward(PassType passType) {
resetOutput(numKept, 7);
} else {
MatrixPtr outV = getOutputValue();
outV = NULL;
if (outV) outV->resize(0, 0);
return;
}
MatrixPtr outV = getOutputValue();

File diff suppressed because it is too large Load Diff

@ -0,0 +1,138 @@
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "MKLDNNLayer.h"
#include "mkldnn.hpp"
namespace paddle {
typedef mkldnn::pooling_forward pool_fwd;
typedef mkldnn::pooling_backward pool_bwd;
/**
* @brief A subclass of MKLDNNLayer pool layer.
*
* The config file api is mkldnn_pool
*/
class MKLDNNPoolLayer : public MKLDNNLayer {
protected:
// padding height and width
int ph_, pw_;
// stride height and width
int sh_, sw_;
// filter(kenerl) height and width
int fh_, fw_;
// pooling_avg or pooling_max
mkldnn::algorithm poolAlgo_;
// MKLDNNMatrixPtr which should be created from CPU Device
MKLDNNMatrixPtr cpuOutVal_;
MKLDNNMatrixPtr cpuOutGrad_;
// convert handle between CPU device and MKLDNN device
std::shared_ptr<mkldnn::reorder> cvtOutVal_;
std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
// save forward primitive_desc, which can be used backward
std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
// according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
// test_pooling_forward.cpp, pool need workspace for backward
std::shared_ptr<mkldnn::memory> workspace_;
public:
explicit MKLDNNPoolLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
~MKLDNNPoolLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void updateInputData() override;
void printSizeInfo() override {
MKLDNNLayer::printSizeInfo();
VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
<< ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
<< ", sw: " << sw_;
}
protected:
/**
* Forward functions: reset buffers(input, output),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
void resetInValue(MKLDNNMatrixPtr& in);
void resetOutValue(MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr in,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(input, output),
* reset primitive descriptor,
* reset pipeline.
*/
void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
void resetOutGrad(MKLDNNMatrixPtr& out);
void resetInGrad(MKLDNNMatrixPtr& in);
void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out);
void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out);
/**
* get padding_r according to
* https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
* test_pooling_forward.cpp
*/
mkldnn::memory::dims getPaddingR() const {
mkldnn::memory::dims padR = {ph_, pw_};
for (int i = 0; i < 2; ++i) {
if ((ih_ + ph_ + padR[0] - fh_) / sh_ + 1 < oh_) {
++padR[0];
}
if ((iw_ + pw_ + padR[1] - fw_) / sw_ + 1 < ow_) {
++padR[1];
}
}
return padR;
}
};
} // namespace paddle

@ -141,6 +141,68 @@ TEST(MKLDNNLayer, ConvLayer) {
testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1});
}
struct testPoolDesc {
int bs, ch; // input channel and output channel are the same
int ih, iw;
int oh, ow;
int fh, fw;
int ph, pw;
int sh, sw;
};
void testPoolLayer(const testPoolDesc& pm) {
const std::string compareTypes[] = {"mkldnn_pool", "pool"};
TestConfig cfg;
cfg.layerConfig.set_type(compareTypes[0]);
cfg.layerConfig.set_size(pm.ch * pm.oh * pm.ow);
cfg.inputDefs.push_back(
{INPUT_DATA,
"layer_0",
/* size of input layer= */ size_t(pm.ch * pm.ih * pm.iw),
0});
LayerInputConfig* input = cfg.layerConfig.add_inputs();
PoolConfig* pool = input->mutable_pool_conf();
// pool->set_pool_type(poolType);
pool->set_channels(pm.ch);
pool->set_img_size(pm.iw);
pool->set_img_size_y(pm.ih);
pool->set_output_x(pm.ow);
pool->set_output_y(pm.oh);
pool->set_size_x(pm.fw);
pool->set_size_y(pm.fh);
pool->set_padding(pm.pw);
pool->set_padding_y(pm.ph);
pool->set_stride(pm.sw);
pool->set_stride_y(pm.sh);
int oh = outputSize(pm.ih, pm.fh, pm.ph, pm.sh, false);
int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false);
CHECK_EQ(ow, pm.ow) << "output size check failed";
CHECK_EQ(oh, pm.oh) << "output size check failed";
MKLDNNTester tester;
for (auto type : {"max-projection", "avg-projection"}) {
pool->set_pool_type(type);
TestConfig ref = cfg;
ref.layerConfig.set_type(compareTypes[1]);
for (auto bs : {pm.bs, 1}) {
tester.run(cfg, ref, bs, pm.ih, pm.iw);
}
}
}
TEST(MkldnnLayer, PoolLayer) {
/* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw*/
testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2});
testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2});
testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2});
testPoolLayer({8, 16, 56, 56, 28, 28, 3, 3, 0, 0, 2, 2});
testPoolLayer({8, 16, 14, 14, 7, 7, 3, 3, 0, 0, 2, 2});
testPoolLayer({4, 16, 7, 7, 1, 1, 7, 7, 0, 0, 1, 1});
testPoolLayer({4, 2, 5, 5, 3, 3, 5, 5, 1, 1, 1, 1});
testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2});
}
// TODO(TJ): add branch test
int main(int argc, char** argv) {

@ -17,6 +17,7 @@ limitations under the License. */
#include <cmath>
#include "BaseMatrix.h"
#include "MathFunctions.h"
#include "NEONFunctions.h"
#include "SIMDFunctions.h"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_base.cuh"
@ -666,6 +667,13 @@ void BaseMatrixT<T>::relu(BaseMatrixT& b) {
applyBinary(binary::Relu<T>(), b);
}
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <>
void BaseMatrixT<float>::relu(BaseMatrixT& b) {
neon::relu(data_, b.data_, height_ * width_);
}
#endif
DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f));
template <class T>
void BaseMatrixT<T>::reluDerivative(BaseMatrixT& b) {

File diff suppressed because it is too large Load Diff

@ -0,0 +1,55 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include "NEONFunctions.h"
#include <arm_neon.h>
namespace paddle {
namespace neon {
// b[i] = a[i] > 0.0f ? a[i] : 0.0f
void relu(const float* a, float* b, int len) {
int offset = len % 16;
float32x4_t ma0, ma1, ma2, ma3;
float32x4_t mb0, mb1, mb2, mb3;
float32x4_t zero = vdupq_n_f32(0.f);
for (int k = 0; k < len / 16; k++, a += 16, b += 16) {
ma0 = vld1q_f32(a);
ma1 = vld1q_f32(a + 4);
ma2 = vld1q_f32(a + 8);
ma3 = vld1q_f32(a + 12);
mb0 = vmaxq_f32(ma0, zero);
mb1 = vmaxq_f32(ma1, zero);
mb2 = vmaxq_f32(ma2, zero);
mb3 = vmaxq_f32(ma3, zero);
vst1q_f32(b, mb0);
vst1q_f32(b + 4, mb1);
vst1q_f32(b + 8, mb2);
vst1q_f32(b + 12, mb3);
}
for (int i = 0; i < offset; i++) {
b[i] = a[i] > 0.0f ? a[i] : 0.0f;
}
}
} // namespace neon
} // namespace paddle
#endif

@ -0,0 +1,23 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace neon {
void relu(const float* a, float* b, int len);
} // namespace neon
} // namespace paddle

@ -825,9 +825,8 @@ void testMaxPoolFwdBwd(int numSamples,
int strideW,
int padH,
int padW) {
int outH = 0, outW = 0;
outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1;
outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1;
int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
int inWidth = imgSizeH * imgSizeW * channels;
MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
@ -927,9 +926,8 @@ void testAvgPoolFwdBwd(int numSamples,
int strideW,
int padH,
int padW) {
int outH = 0, outW = 0;
outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1;
outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1;
int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
int inWidth = imgSizeH * imgSizeW * channels;
MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);

@ -0,0 +1,147 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/cross_entropy_op.h"
namespace paddle {
namespace operators {
using framework::LoDTensor;
class CrossEntropyOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
"Input(Label) must not be null.");
PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), "Output(Y) must not be null.");
auto x = ctx.Input<Tensor>("X");
auto label = ctx.Input<Tensor>("Label");
PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
PADDLE_ENFORCE_EQ(label->dims().size(), 2,
"Input(Label)'s rank must be 2.");
// TODO(xinghai-sun): remove this check after swtiching to bool
PADDLE_ENFORCE(ctx.Attr<int>("soft_label") == 0 ||
ctx.Attr<int>("soft_label") == 1);
PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
"The 1st dimension of Input(X) and Input(Label) must "
"be equal.");
if (ctx.Attr<int>("soft_label") == 1) {
PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
"If Attr(soft_label) == 1, The 2nd dimension of "
"Input(X) and Input(Label) must be equal.");
} else {
PADDLE_ENFORCE_EQ(label->dims()[1], 1,
"If Attr(soft_label) == 0, The 2nd dimension of "
"Input(Label) must be 1.");
}
ctx.Output<LoDTensor>("Y")->Resize({x->dims()[0], 1});
}
};
class CrossEntropyGradientOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
"Input(Label) must not be null.");
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")),
"Input(Y@GRAD) must not be null.");
auto x = ctx.Input<Tensor>("X");
auto label = ctx.Input<Tensor>("Label");
auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
PADDLE_ENFORCE_EQ(dy->dims().size(), 2, "Input(Y@Grad)'s rank must be 2.");
PADDLE_ENFORCE_EQ(label->dims().size(), 2,
"Input(Label)'s rank must be 2.");
// TODO(xinghai-sun): remove this check after swtiching to bool
PADDLE_ENFORCE(ctx.Attr<int>("soft_label") == 0 ||
ctx.Attr<int>("soft_label") == 1);
PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
"The 1st dimension of Input(X) and Input(Label) must "
"be equal.");
PADDLE_ENFORCE_EQ(x->dims()[0], dy->dims()[0],
"The 1st dimension of Input(X) and Input(Y@Grad) must "
"be equal.");
PADDLE_ENFORCE_EQ(dy->dims()[1], 1,
"The 2nd dimension of Input(Y@Grad) must be 1.");
if (ctx.Attr<int>("soft_label") == 1) {
PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
"If Attr(soft_label) == 1, The 2nd dimension of "
"Input(X) and Input(Label) must be equal.");
} else {
PADDLE_ENFORCE_EQ(label->dims()[1], 1,
"If Attr(soft_label) == 0, The 2nd dimension of "
"Input(Label) must be 1.");
}
auto dx = ctx.Output<LoDTensor>(framework::GradVarName("X"));
dx->Resize(x->dims());
}
};
class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
public:
CrossEntropyOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The first input of CrossEntropyOp");
AddInput("Label", "The second input of CrossEntropyOp");
AddOutput("Y", "The output of CrossEntropyOp");
AddAttr<int>("soft_label", "Is soft label. Default zero.").SetDefault(0);
AddComment(R"DOC(
CrossEntropy Operator.
It supports both standard cross-entropy and soft-label cross-entropy loss
computation.
1) One-hot cross-entropy:
soft_label = 0, Label[i, 0] indicates the class index for sample i:
Y[i] = -log(X[i, Label[i]])
2) Soft-label cross-entropy:
soft_label = 1, Label[i, j] indicates the soft label of class j
for sample i:
Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
Please make sure that in this case the summuation of each row of Label
equals one.
3) One-hot cross-entropy with vecterized Input(Label):
As a special case of 2), when each row of Input(Label) has only one
non-zero element (equals 1), soft-label cross-entropy degenerates to a
one-hot cross-entropy with one-hot label representation.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
cross_entropy_grad, ops::CrossEntropyGradientOp);
REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<float>);
REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
ops::CrossEntropyGradientOpKernel<float>);

@ -0,0 +1,158 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/operators/cross_entropy_op.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/hostdevice.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
const int N, const int D) {
// TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
// CUDA_1D_KERNEL_LOOP(i, N) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
Y[i] = -tolerable_value(log(X[i * D + label[i]]));
}
}
template <typename T>
__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
const int N, const int D) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
T sum = static_cast<T>(0);
for (int j = 0; j < D; j++) {
sum += label[i * D + j] * tolerable_value(log(X[i * D + j]));
}
Y[i] = -sum;
}
}
// TODO(qingqing): make zero setting an common function.
template <typename T>
__global__ void zero(T* X, const int N) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
X[i] = 0.0;
}
}
template <typename T>
__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
const int* label, const int N,
const int D) {
// TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
// CUDA_1D_KERNEL_LOOP(i, N) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
int idx = i * D + label[i];
dX[idx] = -dY[i] / X[idx];
}
}
template <typename T>
__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
const T* label, const int N,
const int D) {
// TOOD(qingqing): optimize for this kernel
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
for (int j = 0; j < D; ++j) {
int idx = i * D + j;
dX[idx] = -label[idx] * dY[i] / X[idx];
}
}
}
template <typename T>
class CrossEntropyOpCUDAKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
auto x = ctx.Input<Tensor>("X");
auto y = ctx.Output<Tensor>("Y");
auto label = ctx.Input<Tensor>("Label");
auto* x_data = x->data<T>();
y->mutable_data<T>(ctx.GetPlace());
auto* y_data = y->data<T>();
int n = x->dims()[0];
int d = x->dims()[1];
int block = 512;
int grid = (n + block - 1) / block;
// TODO(qingqing) launch kernel on specified stream
// base on ExecutionContext.
if (ctx.Attr<int>("soft_label") == 1) {
auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
SoftCrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n,
d);
} else {
auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
CrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n, d);
}
}
};
template <typename T>
class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
auto x = ctx.Input<Tensor>("X");
auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto label = ctx.Input<Tensor>("Label");
auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
auto* dy_data = dy->data<T>();
auto* x_data = x->data<T>();
int n = x->dims()[0];
int d = x->dims()[1];
int block = 512;
int grid = (n * d + block - 1) / block;
zero<T><<<grid, block>>>(dx_data, n * d);
grid = (n + block - 1) / block;
// TODO(qingqing): launch kernel on specified stream
// base on ExecutionContext.
if (ctx.Attr<int>("soft_label") == 1) {
auto* label_data = label->data<T>();
SoftCrossEntropyGradientKernel<T><<<grid, block>>>(
dx_data, dy_data, x_data, label_data, n, d);
} else {
auto* label_data = label->data<int>();
CrossEntropyGradientKernel<T><<<grid, block>>>(dx_data, dy_data, x_data,
label_data, n, d);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>);
REGISTER_OP_GPU_KERNEL(cross_entropy_grad,
ops::CrossEntropyGradientOpCUDAKernel<float>);

@ -0,0 +1,117 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/platform/hostdevice.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
HOSTDEVICE T tolerable_value(const T x) {
PADDLE_ASSERT(std::is_floating_point<T>::value);
const T kApproInf = 1e20;
if (x == INFINITY) {
return kApproInf;
}
if (x == -INFINITY) {
return -kApproInf;
}
return x;
}
template <typename T>
class CrossEntropyOpKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto x = ctx.Input<Tensor>("X");
auto y = ctx.Output<Tensor>("Y");
auto* x_data = x->data<T>();
y->mutable_data<T>(ctx.GetPlace());
auto* y_data = y->data<T>();
int batch_size = x->dims()[0];
int class_num = x->dims()[1];
if (ctx.Attr<int>("soft_label") == 1) {
auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
int index = 0;
for (int i = 0; i < batch_size; ++i) {
T sum = static_cast<T>(0);
for (int j = 0; j < class_num; ++j) {
sum += label_data[index] * tolerable_value(std::log(x_data[index]));
y_data[i] = -sum;
index++;
}
}
} else {
auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
for (int i = 0; i < batch_size; ++i) {
int index = i * class_num + label_data[i];
y_data[i] = -tolerable_value(std::log(x_data[index]));
}
}
}
};
template <typename T>
class CrossEntropyGradientOpKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto x = ctx.Input<Tensor>("X");
auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto label = ctx.Input<Tensor>("Label");
auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
auto* dy_data = dy->data<T>();
auto* x_data = x->data<T>();
int batch_size = x->dims()[0];
int class_num = x->dims()[1];
// TODO(qingqing): make zero setting an common function.
if (ctx.Attr<int>("soft_label") == 1) {
auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
int index = 0;
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < class_num; ++j) {
dx_data[index] = -label_data[index] * dy_data[i] / x_data[index];
index++;
}
}
} else {
auto* label_data = label->data<int>();
memset(dx_data, 0, sizeof(T) * batch_size * class_num);
for (int i = 0; i < batch_size; ++i) {
PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
int index = i * class_num + label_data[i];
dx_data[index] = -dy_data[i] / x_data[index];
}
}
}
};
} // namespace operators
} // namespace paddle

@ -0,0 +1,113 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/dropout_op.h"
namespace paddle {
namespace operators {
using framework::Tensor;
using framework::LoDTensor;
class DropoutOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
PADDLE_ENFORCE_GE(ctx.Attr<float>("dropout_prob"), 0);
PADDLE_ENFORCE_LE(ctx.Attr<float>("dropout_prob"), 1);
// TODO(xinghai-sun): remove this check after swtiching to bool
PADDLE_ENFORCE(ctx.Attr<int>("is_training") == 0 ||
ctx.Attr<int>("is_training") == 1);
auto dims = ctx.Input<Tensor>("X")->dims();
ctx.Output<LoDTensor>("Out")->Resize(dims);
if (ctx.Attr<int>("is_training") == 1) {
ctx.Output<LoDTensor>("Mask")->Resize(dims);
}
}
};
template <typename AttrType>
class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
public:
DropoutOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddAttr<AttrType>("dropout_prob", "Probability of setting units to zero.")
.SetDefault(.5f);
// TODO(xinghai-sun): use bool for is_training after bool is supported.
AddAttr<int>("is_training", "Whether in training phase.").SetDefault(1);
AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
AddInput("X", "The input of dropout op.");
AddOutput("Out", "The output of dropout op.");
AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate();
AddComment(R"DOC(
Dropout Operator.
"Dropout" refers to randomly dropping out units in a nerual network. It is a
regularization technique for reducing overfitting by preventing neuron
co-adaption during training. The dropout operator randomly set (according to
the given dropout probability) the outputs of some units to zero, while others
being set to their inputs.
)DOC");
}
};
template <typename AttrType>
class DropoutOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
PADDLE_ENFORCE_EQ(ctx.Attr<int>("is_training"), 1,
"GradOp is only callable when is_training is true");
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Mask"), "Mask must not be null.");
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
"Input(Out@GRAD) must not be null.");
PADDLE_ENFORCE_GE(ctx.Attr<AttrType>("dropout_prob"), 0);
PADDLE_ENFORCE_LE(ctx.Attr<AttrType>("dropout_prob"), 1);
// TODO(xinghai-sun): remove this check after swtiching to bool
PADDLE_ENFORCE(ctx.Attr<int>("is_training") == 0 ||
ctx.Attr<int>("is_training") == 1);
auto x_dims = ctx.Input<Tensor>("X")->dims();
auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
PADDLE_ENFORCE_EQ(x_dims, out_dims,
"Dimensions of Input(X) and Out@Grad must be the same.");
auto mask_dims = ctx.Input<Tensor>("Mask")->dims();
PADDLE_ENFORCE_EQ(x_dims, mask_dims,
"Dimensions of Input(X) and Mask must be the same.");
auto *x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
x_grad->Resize(x_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker<float>, dropout_grad,
ops::DropoutOpGrad<float>);
REGISTER_OP_CPU_KERNEL(
dropout, ops::CPUDropoutKernel<paddle::platform::CPUPlace, float, float>);
REGISTER_OP_CPU_KERNEL(
dropout_grad, ops::DropoutGradKernel<paddle::platform::CPUPlace, float>);

@ -0,0 +1,86 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include <thrust/device_ptr.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/operators/dropout_op.h"
namespace paddle {
namespace operators {
template <typename T, typename AttrType>
struct MaskGenerator {
AttrType dropout_prob;
int seed;
__host__ __device__ MaskGenerator(AttrType dropout_prob, int seed)
: dropout_prob(dropout_prob), seed(seed) {}
__host__ __device__ T operator()(const unsigned int n) const {
thrust::minstd_rand rng;
rng.seed(seed);
thrust::uniform_real_distribution<AttrType> dist(0, 1);
rng.discard(n);
if (dist(rng) < dropout_prob) {
return static_cast<T>(0);
} else {
return static_cast<T>(1);
}
}
};
// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
// Use std::random and thrust::random(thrust is a std library in CUDA) to
// implement uniform random.
template <typename Place, typename T, typename AttrType>
class GPUDropoutKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* y = context.Output<Tensor>("Out");
y->mutable_data<T>(context.GetPlace());
AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
auto X = EigenMatrix<T>::Reshape(*x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
auto place = context.GetEigenDevice<Place>();
if (context.Attr<int>("is_training") == 1) {
auto* mask = context.Output<Tensor>("Mask");
auto* mask_data = mask->mutable_data<T>(context.GetPlace());
int size = framework::product(mask->dims());
int seed = context.Attr<int>("seed");
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(mask_data),
MaskGenerator<T, AttrType>(dropout_prob, seed));
auto M = EigenMatrix<T>::Reshape(*mask, 1);
Y.device(place) = X * M;
} else {
Y.device(place) = X * dropout_prob;
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
dropout, ops::GPUDropoutKernel<paddle::platform::GPUPlace, float, float>);
REGISTER_OP_GPU_KERNEL(
dropout_grad, ops::DropoutGradKernel<paddle::platform::GPUPlace, float>);

@ -0,0 +1,86 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <random>
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename Place, typename T, typename AttrType>
class CPUDropoutKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* y = context.Output<Tensor>("Out");
const auto* x_data = x->data<T>();
auto* y_data = y->mutable_data<T>(context.GetPlace());
AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
if (context.Attr<int>("is_training") == 1) {
auto* mask = context.Output<Tensor>("Mask");
auto* mask_data = mask->mutable_data<T>(context.GetPlace());
int seed = context.Attr<int>("seed");
std::minstd_rand engine;
engine.seed(seed);
std::uniform_real_distribution<AttrType> dist(0, 1);
size_t size = framework::product(mask->dims());
for (size_t i = 0; i < size; ++i) {
if (dist(engine) < dropout_prob) {
mask_data[i] = 0;
y_data[i] = 0;
} else {
mask_data[i] = 1;
y_data[i] = x_data[i];
}
}
} else {
auto X = EigenMatrix<T>::Reshape(*x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
auto place = context.GetEigenDevice<Place>();
Y.device(place) = X * dropout_prob;
}
}
};
template <typename Place, typename T>
class DropoutGradKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE_EQ(context.Attr<int>("is_training"), 1,
"GradOp is only callable when is_training is true");
auto* grad_x = context.Output<Tensor>(framework::GradVarName("X"));
auto* grad_y = context.Input<Tensor>(framework::GradVarName("Out"));
auto* mask = context.Input<Tensor>("Mask");
grad_x->mutable_data<T>(context.GetPlace());
auto M = EigenMatrix<T>::Reshape(*mask, 1);
auto dX = EigenMatrix<T>::Reshape(*grad_x, 1);
auto dY = EigenMatrix<T>::Reshape(*grad_y, 1);
auto place = context.GetEigenDevice<Place>();
dX.device(place) = dY * M;
}
};
} // namespace operators
} // namespace paddle

@ -19,12 +19,13 @@ namespace operators {
namespace math {
template <>
void gemm<platform::CPUPlace, float>(const CBLAS_TRANSPOSE transA,
void gemm<platform::CPUPlace, float>(const platform::DeviceContext& context,
const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M,
const int N, const int K,
const float alpha, const float* A,
const float* B, const float beta, float* C,
platform::DeviceContext* context) {
const float* B, const float beta,
float* C) {
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
@ -33,13 +34,13 @@ void gemm<platform::CPUPlace, float>(const CBLAS_TRANSPOSE transA,
}
template <>
void gemm<platform::CPUPlace, double>(const CBLAS_TRANSPOSE transA,
void gemm<platform::CPUPlace, double>(const platform::DeviceContext& context,
const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M,
const int N, const int K,
const double alpha, const double* A,
const double* B, const double beta,
double* C,
platform::DeviceContext* context) {
double* C) {
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
@ -48,13 +49,10 @@ void gemm<platform::CPUPlace, double>(const CBLAS_TRANSPOSE transA,
}
template <>
void matmul<platform::CPUPlace, float>(const framework::Tensor& matrix_a,
bool trans_a,
const framework::Tensor& matrix_b,
bool trans_b, float alpha,
framework::Tensor* matrix_out,
float beta,
platform::DeviceContext* context) {
void matmul<platform::CPUPlace, float>(
const platform::DeviceContext& context, const framework::Tensor& matrix_a,
bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha,
framework::Tensor* matrix_out, float beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
@ -74,18 +72,15 @@ void matmul<platform::CPUPlace, float>(const framework::Tensor& matrix_a,
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::CPUPlace, float>(
transA, transB, M, N, K, alpha, matrix_a.data<float>(),
matrix_b.data<float>(), beta, matrix_out->data<float>(), context);
context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
matrix_b.data<float>(), beta, matrix_out->data<float>());
}
template <>
void matmul<platform::CPUPlace, double>(const framework::Tensor& matrix_a,
bool trans_a,
const framework::Tensor& matrix_b,
bool trans_b, double alpha,
framework::Tensor* matrix_out,
double beta,
platform::DeviceContext* context) {
void matmul<platform::CPUPlace, double>(
const platform::DeviceContext& context, const framework::Tensor& matrix_a,
bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha,
framework::Tensor* matrix_out, double beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
@ -105,8 +100,8 @@ void matmul<platform::CPUPlace, double>(const framework::Tensor& matrix_a,
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::CPUPlace, double>(
transA, transB, M, N, K, alpha, matrix_a.data<double>(),
matrix_b.data<double>(), beta, matrix_out->data<double>(), context);
context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
matrix_b.data<double>(), beta, matrix_out->data<double>());
}
} // namespace math

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save