Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into crop_op

update-doc-pybind
wanghaoshuang 8 years ago
commit fa4908dc10

1
.gitignore vendored

@ -27,3 +27,4 @@ CMakeFiles
cmake_install.cmake
paddle/.timestamp
python/paddlepaddle.egg-info/
paddle/pybind/pybind.h

File diff suppressed because it is too large Load Diff

@ -22,10 +22,10 @@ limitations under the License. */
*/
typedef enum {
HL_POOLING_MAX = 0,
// average includes padded values
HL_POOLING_AVERAGE = 1,
// average does not include padded values
HL_POOLING_AVERAGE_EXCLUDE_PADDING = 2,
HL_POOLING_AVERAGE = 1,
// average includes padded values
HL_POOLING_AVERAGE_INCLUDE_PADDING = 2,
HL_POOLING_END
} hl_pooling_mode_t;

@ -461,7 +461,7 @@ class add<float32x4_t> {
public:
INLINE float32x4_t operator()(const float32x4_t a,
const float32x4_t b) const {
return vmulq_f32(a, b);
return vaddq_f32(a, b);
}
};

@ -211,13 +211,11 @@ __global__ void KeAvgPoolForward(const int nthreads,
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int pool_size = (hend - hstart) * (wend - wstart);
int hend = min(hstart + sizeY, height);
int wend = min(wstart + sizeX, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, height);
wend = min(wend, width);
int pool_size = (hend - hstart) * (wend - wstart);
real aveval = 0;
inputData += (frameNum * channels + c) * height * width;
@ -299,12 +297,14 @@ __global__ void KeAvgPoolBackward(const int nthreads,
outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
for (int ph = phstart; ph < phend; ++ph) {
int hstart = ph * strideH - padH;
int hend = min(hstart + sizeY, height);
hstart = max(hstart, 0);
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int wend = min(wstart + sizeX, width);
wstart = max(wstart, 0);
int poolsize = (hend - hstart) * (wend - wstart);
gradient += outGrad[ph * pooledW + pw] / poolsize;
}
@ -600,16 +600,13 @@ __global__ void KeAvgPool3DForward(const int nthreads,
int dstart = pd * strideD - padD;
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int dend = min(dstart + sizeZ, depth + padD);
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
int dend = min(dstart + sizeZ, depth);
int hend = min(hstart + sizeY, height);
int wend = min(wstart + sizeX, width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dend = min(dend, depth);
hend = min(hend, height);
wend = min(wend, width);
int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
real aveval = 0;
inputData += (frameNum * channels + c) * depth * height * width;
@ -712,15 +709,18 @@ __global__ void KeAvgPool3DBackward(const int nthreads,
outGrad += (frameNum * channels + offsetC) * pooledD * pooledH * pooledW;
for (int pd = pdstart; pd < pdend; ++pd) {
int dstart = pd * strideD - padD;
int dend = min(dstart + sizeZ, depth);
dstart = max(dstart, 0);
for (int ph = phstart; ph < phend; ++ph) {
int hstart = ph * strideH - padH;
int hend = min(hstart + sizeY, height);
hstart = max(hstart, 0);
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int dstart = pd * strideD - padD;
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int dend = min(dstart + sizeZ, depth + padD);
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int wend = min(wstart + sizeX, width);
wstart = max(wstart, 0);
int poolsize = (dend - dstart) * (hend - hstart) * (wend - wstart);
gradient += outGrad[(pd * pooledH + ph) * pooledW + pw] / poolsize;
}

@ -432,11 +432,11 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
cudnn_mode = CUDNN_POOLING_MAX;
break;
case HL_POOLING_AVERAGE:
cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
break;
case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
break;
case HL_POOLING_AVERAGE_INCLUDE_PADDING:
cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
break;
default:
LOG(FATAL) << "parameter mode error";
}

@ -22,14 +22,14 @@ namespace framework {
template <>
Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
platform::CPUPlace, Eigen::DefaultDevice>() const {
return *device_context_->get_eigen_device<Eigen::DefaultDevice>();
return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
}
#ifndef PADDLE_ONLY_CPU
template <>
Eigen::GpuDevice&
ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
return *device_context_->get_eigen_device<Eigen::GpuDevice>();
return *device_context_.get_eigen_device<Eigen::GpuDevice>();
}
#endif

@ -366,7 +366,7 @@ struct EigenDeviceConverter<platform::GPUPlace> {
class ExecutionContext : public InferShapeContext {
public:
ExecutionContext(const OperatorBase& op, const Scope& scope,
const platform::DeviceContext* device_context)
const platform::DeviceContext& device_context)
: InferShapeContext(op, scope), device_context_(device_context) {}
template <typename PlaceType,
@ -374,9 +374,9 @@ class ExecutionContext : public InferShapeContext {
typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
DeviceType& GetEigenDevice() const;
platform::Place GetPlace() const { return device_context_->GetPlace(); }
platform::Place GetPlace() const { return device_context_.GetPlace(); }
const platform::DeviceContext* device_context() const {
const platform::DeviceContext& device_context() const {
return device_context_;
}
@ -401,7 +401,8 @@ class ExecutionContext : public InferShapeContext {
return res;
}
const platform::DeviceContext* device_context_;
private:
const platform::DeviceContext& device_context_;
};
template <>
@ -461,7 +462,7 @@ class OperatorWithKernel : public OperatorBase {
void Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const final {
auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
opKernel->Compute(ExecutionContext(*this, scope, &dev_ctx));
opKernel->Compute(ExecutionContext(*this, scope, dev_ctx));
}
static std::unordered_map<std::string /* op_type */, OpKernelMap>&

@ -29,9 +29,9 @@ bool CudnnPoolLayer::typeCheck(const std::string &poolType,
if (mode) {
*mode = HL_POOLING_AVERAGE;
}
} else if (poolType == "cudnn-avg-excl-pad-pool") {
} else if (poolType == "cudnn-avg-incl-pad-pool") {
if (mode) {
*mode = HL_POOLING_AVERAGE_EXCLUDE_PADDING;
*mode = HL_POOLING_AVERAGE_INCLUDE_PADDING;
}
} else {
return false;

File diff suppressed because it is too large Load Diff

@ -0,0 +1,138 @@
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "MKLDNNLayer.h"
#include "mkldnn.hpp"
namespace paddle {
typedef mkldnn::pooling_forward pool_fwd;
typedef mkldnn::pooling_backward pool_bwd;
/**
* @brief A subclass of MKLDNNLayer pool layer.
*
* The config file api is mkldnn_pool
*/
class MKLDNNPoolLayer : public MKLDNNLayer {
protected:
// padding height and width
int ph_, pw_;
// stride height and width
int sh_, sw_;
// filter(kenerl) height and width
int fh_, fw_;
// pooling_avg or pooling_max
mkldnn::algorithm poolAlgo_;
// MKLDNNMatrixPtr which should be created from CPU Device
MKLDNNMatrixPtr cpuOutVal_;
MKLDNNMatrixPtr cpuOutGrad_;
// convert handle between CPU device and MKLDNN device
std::shared_ptr<mkldnn::reorder> cvtOutVal_;
std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
// save forward primitive_desc, which can be used backward
std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
// according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
// test_pooling_forward.cpp, pool need workspace for backward
std::shared_ptr<mkldnn::memory> workspace_;
public:
explicit MKLDNNPoolLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
~MKLDNNPoolLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void updateInputData() override;
void printSizeInfo() override {
MKLDNNLayer::printSizeInfo();
VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
<< ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
<< ", sw: " << sw_;
}
protected:
/**
* Forward functions: reset buffers(input, output),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
void resetInValue(MKLDNNMatrixPtr& in);
void resetOutValue(MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr in,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(input, output),
* reset primitive descriptor,
* reset pipeline.
*/
void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
void resetOutGrad(MKLDNNMatrixPtr& out);
void resetInGrad(MKLDNNMatrixPtr& in);
void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out);
void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out);
/**
* get padding_r according to
* https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
* test_pooling_forward.cpp
*/
mkldnn::memory::dims getPaddingR() const {
mkldnn::memory::dims padR = {ph_, pw_};
for (int i = 0; i < 2; ++i) {
if ((ih_ + ph_ + padR[0] - fh_) / sh_ + 1 < oh_) {
++padR[0];
}
if ((iw_ + pw_ + padR[1] - fw_) / sw_ + 1 < ow_) {
++padR[1];
}
}
return padR;
}
};
} // namespace paddle

@ -141,6 +141,68 @@ TEST(MKLDNNLayer, ConvLayer) {
testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1});
}
struct testPoolDesc {
int bs, ch; // input channel and output channel are the same
int ih, iw;
int oh, ow;
int fh, fw;
int ph, pw;
int sh, sw;
};
void testPoolLayer(const testPoolDesc& pm) {
const std::string compareTypes[] = {"mkldnn_pool", "pool"};
TestConfig cfg;
cfg.layerConfig.set_type(compareTypes[0]);
cfg.layerConfig.set_size(pm.ch * pm.oh * pm.ow);
cfg.inputDefs.push_back(
{INPUT_DATA,
"layer_0",
/* size of input layer= */ size_t(pm.ch * pm.ih * pm.iw),
0});
LayerInputConfig* input = cfg.layerConfig.add_inputs();
PoolConfig* pool = input->mutable_pool_conf();
// pool->set_pool_type(poolType);
pool->set_channels(pm.ch);
pool->set_img_size(pm.iw);
pool->set_img_size_y(pm.ih);
pool->set_output_x(pm.ow);
pool->set_output_y(pm.oh);
pool->set_size_x(pm.fw);
pool->set_size_y(pm.fh);
pool->set_padding(pm.pw);
pool->set_padding_y(pm.ph);
pool->set_stride(pm.sw);
pool->set_stride_y(pm.sh);
int oh = outputSize(pm.ih, pm.fh, pm.ph, pm.sh, false);
int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false);
CHECK_EQ(ow, pm.ow) << "output size check failed";
CHECK_EQ(oh, pm.oh) << "output size check failed";
MKLDNNTester tester;
for (auto type : {"max-projection", "avg-projection"}) {
pool->set_pool_type(type);
TestConfig ref = cfg;
ref.layerConfig.set_type(compareTypes[1]);
for (auto bs : {pm.bs, 1}) {
tester.run(cfg, ref, bs, pm.ih, pm.iw);
}
}
}
TEST(MkldnnLayer, PoolLayer) {
/* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw*/
testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2});
testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2});
testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2});
testPoolLayer({8, 16, 56, 56, 28, 28, 3, 3, 0, 0, 2, 2});
testPoolLayer({8, 16, 14, 14, 7, 7, 3, 3, 0, 0, 2, 2});
testPoolLayer({4, 16, 7, 7, 1, 1, 7, 7, 0, 0, 1, 1});
testPoolLayer({4, 2, 5, 5, 3, 3, 5, 5, 1, 1, 1, 1});
testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2});
}
// TODO(TJ): add branch test
int main(int argc, char** argv) {

@ -17,6 +17,7 @@ limitations under the License. */
#include <cmath>
#include "BaseMatrix.h"
#include "MathFunctions.h"
#include "NEONFunctions.h"
#include "SIMDFunctions.h"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_base.cuh"
@ -666,6 +667,13 @@ void BaseMatrixT<T>::relu(BaseMatrixT& b) {
applyBinary(binary::Relu<T>(), b);
}
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <>
void BaseMatrixT<float>::relu(BaseMatrixT& b) {
neon::relu(data_, b.data_, height_ * width_);
}
#endif
DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f));
template <class T>
void BaseMatrixT<T>::reluDerivative(BaseMatrixT& b) {

File diff suppressed because it is too large Load Diff

@ -0,0 +1,55 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include "NEONFunctions.h"
#include <arm_neon.h>
namespace paddle {
namespace neon {
// b[i] = a[i] > 0.0f ? a[i] : 0.0f
void relu(const float* a, float* b, int len) {
int offset = len % 16;
float32x4_t ma0, ma1, ma2, ma3;
float32x4_t mb0, mb1, mb2, mb3;
float32x4_t zero = vdupq_n_f32(0.f);
for (int k = 0; k < len / 16; k++, a += 16, b += 16) {
ma0 = vld1q_f32(a);
ma1 = vld1q_f32(a + 4);
ma2 = vld1q_f32(a + 8);
ma3 = vld1q_f32(a + 12);
mb0 = vmaxq_f32(ma0, zero);
mb1 = vmaxq_f32(ma1, zero);
mb2 = vmaxq_f32(ma2, zero);
mb3 = vmaxq_f32(ma3, zero);
vst1q_f32(b, mb0);
vst1q_f32(b + 4, mb1);
vst1q_f32(b + 8, mb2);
vst1q_f32(b + 12, mb3);
}
for (int i = 0; i < offset; i++) {
b[i] = a[i] > 0.0f ? a[i] : 0.0f;
}
}
} // namespace neon
} // namespace paddle
#endif

@ -0,0 +1,23 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace neon {
void relu(const float* a, float* b, int len);
} // namespace neon
} // namespace paddle

@ -825,9 +825,8 @@ void testMaxPoolFwdBwd(int numSamples,
int strideW,
int padH,
int padW) {
int outH = 0, outW = 0;
outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1;
outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1;
int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
int inWidth = imgSizeH * imgSizeW * channels;
MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
@ -927,9 +926,8 @@ void testAvgPoolFwdBwd(int numSamples,
int strideW,
int padH,
int padW) {
int outH = 0, outW = 0;
outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1;
outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1;
int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
int inWidth = imgSizeH * imgSizeW * channels;
MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);

@ -12,26 +12,38 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>
#include "paddle/operators/accuracy_op.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
__global__ void AccuracySingleKernel(const int N, const int D, const int top_k,
const int* Xdata, const int* labelData,
float* accuracy) {
int correct = 0;
for (int row = 0; row < N; row++) {
const int label = labelData[row];
for (int col = 0; col < D; col++) {
const int pred = Xdata[row * D + col];
if (pred == label) {
++correct;
template <int BlockSize>
__global__ void AccuracyCudaKernel(const int N, const int D, const int* Xdata,
const int* labeldata, float* accuracy) {
int count = 0;
__shared__ int total[BlockSize];
// support only 1 block
for (int i = threadIdx.x; i < (N); i += BlockSize) {
for (int j = 0; j < D; ++j) {
if (Xdata[i * D + j] == labeldata[i]) {
++count;
break;
}
}
}
*accuracy = static_cast<float>(correct) / static_cast<float>(N);
total[threadIdx.x] = count;
__syncthreads();
// reduce the count with init value 0, and output accuracy.
int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
if (threadIdx.x == 0) {
*accuracy = static_cast<float>(result) / static_cast<float>(N);
}
}
template <typename T>
@ -57,8 +69,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel {
return;
}
AccuracySingleKernel<<<1, 1>>>(num_samples, infer_width, 1, inference_data,
label_data, accuracy_data);
AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS>>>(
num_samples, infer_width, inference_data, label_data, accuracy_data);
}
};

@ -0,0 +1,197 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/operators/net_op.h"
namespace paddle {
namespace operators {
class FCOp : public NetOp {
public:
FCOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
PADDLE_ENFORCE(!Inputs("X").empty(),
"Inputs(X) of FCOp should not be null.");
PADDLE_ENFORCE(!Inputs("W").empty(),
"Inputs(W) of FCOp should not be null.");
PADDLE_ENFORCE(!Outputs("MulOut").empty(),
"Outputs(MulOut) of FCOp should not be null.");
PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
"Output(Out) of FCOp should not be null.");
auto x = Inputs("X");
auto w = Inputs("W");
auto mul_out = Outputs("MulOut");
PADDLE_ENFORCE_EQ(
x.size(), w.size(),
"The size of inputs X(%d) should be the same as that of weights W(%d).",
x.size(), w.size());
PADDLE_ENFORCE_EQ(mul_out.size(), x.size(),
"The size of intermediate mul_out(%d) should be the same "
"as that of inputs X(%d).",
mul_out.size(), x.size());
size_t n = x.size();
PADDLE_ENFORCE_GE(n, static_cast<size_t>(1),
"The size of inputs X(%d) should be no less than 1.", n);
auto x_num_col_dims = Attr<std::vector<int>>("xNumColDims");
// Set all values or set no values (use the default value)
if (!x_num_col_dims.empty()) {
PADDLE_ENFORCE_EQ(x_num_col_dims.size(), n,
"The size of attribute xNumColDims(%d) should be the "
"same as that of inputs X(%d).",
x_num_col_dims.size(), n);
} else {
x_num_col_dims.resize(n);
for (size_t i = 0; i < n; i++) {
x_num_col_dims[i] = 1;
}
}
// mul_out[i] = X[i] * W[i]
for (size_t i = 0; i < n; i++) {
framework::AttributeMap mul_attr;
mul_attr["x_num_col_dims"] = static_cast<int>(x_num_col_dims[i]);
mul_attr["y_num_col_dims"] = static_cast<int>(1);
AppendOp(
framework::OpRegistry::CreateOp("mul", {{"X", {x[i]}}, {"Y", {w[i]}}},
{{"Out", {mul_out[i]}}}, mul_attr));
}
// sum_out = X[0] * W[0] + ... + X[n-1] * W[n-1]
auto sum_out = mul_out[0];
if (n > 1) {
PADDLE_ENFORCE_NE(Output("SumOut"), framework::kEmptyVarName,
"Output(SumOut) of FCOp should not be null when the "
"size of Inputs(X) > 1.");
sum_out = Output("SumOut");
AppendOp(framework::OpRegistry::CreateOp("sum", {{"X", {mul_out}}},
{{"Out", {sum_out}}}, {}));
} else {
if (Output("SumOut") != framework::kEmptyVarName) {
this->Rename(Output("SumOut"), framework::kEmptyVarName);
}
}
// add_out = sum_out + b
auto b = Input("B");
auto add_out = sum_out;
if (b != framework::kEmptyVarName) {
PADDLE_ENFORCE_NE(
Output("AddOut"), framework::kEmptyVarName,
"Output(AddOut) of FCOp should not be null when Input(B) is set.");
add_out = Output("AddOut");
AppendOp(framework::OpRegistry::CreateOp(
"rowwise_add", {{"X", {sum_out}}, {"b", {Input("B")}}},
{{"Out", {add_out}}}, {}));
} else {
if (Output("AddOut") != framework::kEmptyVarName) {
this->Rename(Output("AddOut"), framework::kEmptyVarName);
}
}
auto activation = Attr<std::string>("activation");
AppendOp(framework::OpRegistry::CreateOp(activation, {{"X", {add_out}}},
{{"Y", {Output("Out")}}}, {}));
CompleteAddOp(false);
}
};
class FCOpMaker : public framework::OpProtoAndCheckerMaker {
public:
FCOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"(A vector of Tensors) each input Tensor can be of arbitrary "
"dimension, and will be reshaped to a 2-D matrix of size "
"(minibatch, number_of_input_features) according to attribute "
"xNumColDims.")
.AsDuplicable();
AddInput("W",
"(A vector of Tensors) the weights of FC operator, a "
"vector of 2-D matrix of size "
"(number_of_input_features, number_of_neurons).")
.AsDuplicable();
AddInput("B",
"(Tensor) the bias of FC operator, a 1-D vector of size "
"number_of_neurons.");
AddOutput("Out",
"(Tensor) the activated output matrix of FC operator, a 2-D "
"matrix of size (minibatch, number_of_neurons).");
AddOutput("MulOut",
"(A vector of Tensors) the intermediate outputs of FC operator, "
"each Tensor saving the product of X_i * W_i.")
.AsIntermediate()
.AsDuplicable();
AddOutput(
"SumOut",
"(Tensor) the intermediate output of FC operator, "
"saving the sum of the products of X and W, that is sum{X_i * W_i}.")
.AsIntermediate();
AddOutput("AddOut",
"(Tensor) the non-actived output of FC operator, "
"saving sum{X_i * W_i} + B.")
.AsIntermediate();
AddAttr<std::string>(
"activation",
"(string, default identity) the activation type of FC operator.")
.SetDefault("identity")
.InEnum({"identity", "sigmoid", "softmax"});
AddAttr<std::vector<int>>(
"xNumColDims",
"(std::vector<int>) The inputs Tensors of FC operator can be of "
"more than 2 dimensions. In that case, each input Tensor `X_i` will be "
"reshaped to a 2-D matrix. The matrix's first dimension "
"(the length of column) will be the product of `X_i`'s last "
"`xNumColDims_i` dimensions, that is "
"`X_i.dims[0] x ... x X_i.dims[xNumColDims_i - 1]`. "
"The matrix's second dimension (the length of row) will be the product "
"of `X_i`'s first `rank - xNumColDims_i` dimensions, that is "
"`X_i.dims[xNumColDims_i] x ... x X_i.dims[rank - 1]`)")
.SetDefault(std::vector<int>{});
AddComment(R"DOC(
Fully Connected Operator, known as Fully Connected Layer or Inner Product Layer
in Convolutional Neural Networks. Neurons in a fully connected layer have
full connections to all activations in the previous layer.
It computes an inner product of a set of
learned weights with a matrix multiplication followed by a bias offset
(optionally).
Equation:
Out = Act(sum_n{X_i * W_i} + B)
where X_i is Tensor that will be reshaped to a 2-D matrix of size (M x K),
usually M is the minibatch size and K is the number of input features.
W_i is a 2-D matrix of size (K x N), where N means the number of neurons
in the fully connected layer. B is a 1-D vector of size N.
Thus, the output Out is a 2-D matrix of size (M x N).
Activation type can be set to `identity` (default), `sigmoid` or `softmax`.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(fc, ops::FCOp, ops::FCOpMaker);

@ -27,7 +27,7 @@ class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input tensor of identity operator.");
AddOutput("Out", "The output tensor of identity operator.");
AddOutput("Y", "The output tensor of identity operator.");
AddComment(R"DOC(
The identity operator is an alias of the scale operator
with the attribute scale fixed to 1.0.
@ -44,12 +44,13 @@ class IdentityOp : public NetOp {
: NetOp(type, inputs, outputs, attrs) {
PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName,
"Input(X) of IdentityOp should not be null.");
PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
"Output(Out) of IdentityOp should not be null.");
PADDLE_ENFORCE_NE(Output("Y"), framework::kEmptyVarName,
"Output(Y) of IdentityOp should not be null.");
AppendOp(framework::OpRegistry::CreateOp(
"scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}},
"scale", {{"X", {Input("X")}}}, {{"Out", {Output("Y")}}},
{{"scale", static_cast<AttrType>(1)}}));
CompleteAddOp(false);
}
};

@ -19,12 +19,13 @@ namespace operators {
namespace math {
template <>
void gemm<platform::CPUPlace, float>(const CBLAS_TRANSPOSE transA,
void gemm<platform::CPUPlace, float>(const platform::DeviceContext& context,
const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M,
const int N, const int K,
const float alpha, const float* A,
const float* B, const float beta, float* C,
platform::DeviceContext* context) {
const float* B, const float beta,
float* C) {
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
@ -33,13 +34,13 @@ void gemm<platform::CPUPlace, float>(const CBLAS_TRANSPOSE transA,
}
template <>
void gemm<platform::CPUPlace, double>(const CBLAS_TRANSPOSE transA,
void gemm<platform::CPUPlace, double>(const platform::DeviceContext& context,
const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M,
const int N, const int K,
const double alpha, const double* A,
const double* B, const double beta,
double* C,
platform::DeviceContext* context) {
double* C) {
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
@ -48,13 +49,10 @@ void gemm<platform::CPUPlace, double>(const CBLAS_TRANSPOSE transA,
}
template <>
void matmul<platform::CPUPlace, float>(const framework::Tensor& matrix_a,
bool trans_a,
const framework::Tensor& matrix_b,
bool trans_b, float alpha,
framework::Tensor* matrix_out,
float beta,
platform::DeviceContext* context) {
void matmul<platform::CPUPlace, float>(
const platform::DeviceContext& context, const framework::Tensor& matrix_a,
bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha,
framework::Tensor* matrix_out, float beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
@ -74,18 +72,15 @@ void matmul<platform::CPUPlace, float>(const framework::Tensor& matrix_a,
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::CPUPlace, float>(
transA, transB, M, N, K, alpha, matrix_a.data<float>(),
matrix_b.data<float>(), beta, matrix_out->data<float>(), context);
context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
matrix_b.data<float>(), beta, matrix_out->data<float>());
}
template <>
void matmul<platform::CPUPlace, double>(const framework::Tensor& matrix_a,
bool trans_a,
const framework::Tensor& matrix_b,
bool trans_b, double alpha,
framework::Tensor* matrix_out,
double beta,
platform::DeviceContext* context) {
void matmul<platform::CPUPlace, double>(
const platform::DeviceContext& context, const framework::Tensor& matrix_a,
bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha,
framework::Tensor* matrix_out, double beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
@ -105,8 +100,8 @@ void matmul<platform::CPUPlace, double>(const framework::Tensor& matrix_a,
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::CPUPlace, double>(
transA, transB, M, N, K, alpha, matrix_a.data<double>(),
matrix_b.data<double>(), beta, matrix_out->data<double>(), context);
context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
matrix_b.data<double>(), beta, matrix_out->data<double>());
}
} // namespace math

@ -19,12 +19,13 @@ namespace operators {
namespace math {
template <>
void gemm<platform::GPUPlace, float>(const CBLAS_TRANSPOSE transA,
void gemm<platform::GPUPlace, float>(const platform::DeviceContext& context,
const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M,
const int N, const int K,
const float alpha, const float* A,
const float* B, const float beta, float* C,
platform::DeviceContext* context) {
const float* B, const float beta,
float* C) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
@ -35,18 +36,19 @@ void gemm<platform::GPUPlace, float>(const CBLAS_TRANSPOSE transA,
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
PADDLE_ENFORCE(platform::dynload::cublasSgemm(
reinterpret_cast<platform::CUDADeviceContext*>(context)->cublas_handle(),
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.cublas_handle(),
cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
}
template <>
void gemm<platform::GPUPlace, double>(const CBLAS_TRANSPOSE transA,
void gemm<platform::GPUPlace, double>(const platform::DeviceContext& context,
const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M,
const int N, const int K,
const double alpha, const double* A,
const double* B, const double beta,
double* C,
platform::DeviceContext* context) {
double* C) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
@ -56,18 +58,16 @@ void gemm<platform::GPUPlace, double>(const CBLAS_TRANSPOSE transA,
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
PADDLE_ENFORCE(platform::dynload::cublasDgemm(
reinterpret_cast<platform::CUDADeviceContext*>(context)->cublas_handle(),
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.cublas_handle(),
cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
}
template <>
void matmul<platform::GPUPlace, float>(const framework::Tensor& matrix_a,
bool trans_a,
const framework::Tensor& matrix_b,
bool trans_b, float alpha,
framework::Tensor* matrix_out,
float beta,
platform::DeviceContext* context) {
void matmul<platform::GPUPlace, float>(
const platform::DeviceContext& context, const framework::Tensor& matrix_a,
bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha,
framework::Tensor* matrix_out, float beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
@ -87,18 +87,15 @@ void matmul<platform::GPUPlace, float>(const framework::Tensor& matrix_a,
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::GPUPlace, float>(
transA, transB, M, N, K, alpha, matrix_a.data<float>(),
matrix_b.data<float>(), beta, matrix_out->data<float>(), context);
context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
matrix_b.data<float>(), beta, matrix_out->data<float>());
}
template <>
void matmul<platform::GPUPlace, double>(const framework::Tensor& matrix_a,
bool trans_a,
const framework::Tensor& matrix_b,
bool trans_b, double alpha,
framework::Tensor* matrix_out,
double beta,
platform::DeviceContext* context) {
void matmul<platform::GPUPlace, double>(
const platform::DeviceContext& context, const framework::Tensor& matrix_a,
bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha,
framework::Tensor* matrix_out, double beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
@ -118,8 +115,8 @@ void matmul<platform::GPUPlace, double>(const framework::Tensor& matrix_a,
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::GPUPlace, double>(
transA, transB, M, N, K, alpha, matrix_a.data<double>(),
matrix_b.data<double>(), beta, matrix_out->data<double>(), context);
context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
matrix_b.data<double>(), beta, matrix_out->data<double>());
}
} // namespace math

@ -66,16 +66,16 @@ namespace math {
// For more detailed info, please refer to
// http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
template <typename Place, typename T>
void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
const int M, const int N, const int K, const T alpha, const T* A,
const T* B, const T beta, T* C, platform::DeviceContext* context);
void gemm(const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const T alpha, const T* A, const T* B, const T beta, T* C);
// matrix multiply with continuous memory
template <typename Place, typename T>
void matmul(const framework::Tensor& matrix_a, bool trans_a,
void matmul(const platform::DeviceContext& context,
const framework::Tensor& matrix_a, bool trans_a,
const framework::Tensor& matrix_b, bool trans_b, T alpha,
framework::Tensor* matrix_out, T beta,
platform::DeviceContext* context);
framework::Tensor* matrix_out, T beta);
} // namespace math
} // namespace operators

@ -15,8 +15,7 @@ TEST(math_function, notrans_mul_trans) {
memcpy(input1_ptr, arr, 6 * sizeof(float));
auto* gpu_place = new paddle::platform::GPUPlace(0);
paddle::platform::DeviceContext* context =
new paddle::platform::CUDADeviceContext(*gpu_place);
paddle::platform::CUDADeviceContext context(*gpu_place);
input1_gpu.CopyFrom<float>(input1, *gpu_place);
input2_gpu.CopyFrom<float>(input1, *gpu_place);
@ -24,7 +23,7 @@ TEST(math_function, notrans_mul_trans) {
out_gpu.mutable_data<float>({2, 2}, *gpu_place);
paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0, context);
context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
out.CopyFrom<float>(out_gpu, *cpu_place);
@ -33,6 +32,7 @@ TEST(math_function, notrans_mul_trans) {
EXPECT_EQ(out_ptr[1], 14);
EXPECT_EQ(out_ptr[2], 14);
EXPECT_EQ(out_ptr[3], 50);
delete gpu_place;
}
TEST(math_function, trans_mul_notrans) {
@ -48,8 +48,7 @@ TEST(math_function, trans_mul_notrans) {
memcpy(input1_ptr, arr, 6 * sizeof(float));
auto* gpu_place = new paddle::platform::GPUPlace(0);
paddle::platform::DeviceContext* context =
new paddle::platform::CUDADeviceContext(*gpu_place);
paddle::platform::CUDADeviceContext context(*gpu_place);
input1_gpu.CopyFrom<float>(input1, *gpu_place);
input2_gpu.CopyFrom<float>(input1, *gpu_place);
@ -57,7 +56,7 @@ TEST(math_function, trans_mul_notrans) {
out_gpu.mutable_data<float>({3, 3}, *gpu_place);
paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0, context);
context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
out.CopyFrom<float>(out_gpu, *cpu_place);
@ -71,5 +70,6 @@ TEST(math_function, trans_mul_notrans) {
EXPECT_EQ(out_ptr[6], 15);
EXPECT_EQ(out_ptr[7], 22);
EXPECT_EQ(out_ptr[8], 29);
delete gpu_place;
}
#endif

@ -71,7 +71,7 @@ class MinusGradOp : public NetOp {
// x_grad = out_grad
AppendOp(framework::OpRegistry::CreateOp("identity", {{"X", {out_grad}}},
{{"Out", {x_grad}}}, {}));
{{"Y", {x_grad}}}, {}));
framework::AttributeMap scale_attr;
scale_attr["scale"] = static_cast<AttrType>(-1);

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save