Gradient check using the graph API (#5027)

* Simplify Gradient Check

* Stash

* Extract apply_backward_pass to backward.py

Rename apply_backward_pass to append_backward_ops

* Use graph API to check gradient

* Fix ci

* Fix CI

* Fix backward for double precision

* Stash

* Fix CI

* Fix ci

* Ignore GRU test

* Ignore xe op

* Fix CI

* Fix softmax with xe gradient

The correct equation should be IG = OG * (d_softmax_with_xe()), i.e. the input gradient is the output (loss) gradient multiplied by the derivative of softmax-with-cross-entropy (see the NumPy sketch just below the commit info)

* Fix typo

* Fix merge error

* Disable LRN
Authored by Yu Yang, committed by QI JUN
parent fc68290bcc
commit be00b0c4d6
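
As a companion to the "Fix softmax with xe gradient" commit above, here is a minimal NumPy sketch of the hard-label case of that equation: the input gradient equals the loss (output) gradient times the derivative of softmax-with-cross-entropy, i.e. out_grad * (softmax - one_hot(label)). The helper names below are illustrative and are not code from this PR.

import numpy as np

def softmax(logits):
    # numerically stable softmax over the last axis
    e = np.exp(logits - logits.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def softmax_xe_grad(logits, labels, loss_grad):
    # IG = OG * d_softmax_with_xe():
    # for hard labels, d_softmax_with_xe() = softmax(logits) - one_hot(labels)
    batch = logits.shape[0]
    grad = softmax(logits)
    grad[np.arange(batch), labels] -= 1.0        # softmax - one_hot
    return grad * loss_grad[:, np.newaxis]       # scale each row by its loss gradient

logits = np.array([[2.0, 1.0, 0.1]])
labels = np.array([1])
print(softmax_xe_grad(logits, labels, loss_grad=np.ones(1)))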

@@ -26,7 +26,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim op_info operator)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim op_info operator glog)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)

@@ -452,11 +452,13 @@ ParamGradInfoMap AppendBackward(
std::transform(target_shape_desc.begin(), target_shape_desc.end(),
std::back_inserter(target_shape),
[](int64_t dim) { return static_cast<int>(dim); });
VLOG(3) << "backward from loss=" << target.Name()
<< " data_type=" << target.GetDataType();
std::unique_ptr<OpDescBind> fill_one_op(
new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
{{"shape", target_shape},
{"value", static_cast<float>(1.0)},
{"data_type", framework::DataType::FP32}}));
{"data_type", target.GetDataType()}}));
root_block->AppendAllocatedOp(std::move(fill_one_op));
size_t forward_op_num = root_block->OpSize();
size_t forward_block_num = program_desc.Size();
@@ -475,8 +477,7 @@ ParamGradInfoMap AppendBackward(
std::unordered_map<std::string, GradVarInfo> retv;
auto var = root_block->Var(fill_one_op_out);
// FIXME(qiao) infer the data type
var->SetDataType(framework::DataType::FP32);
var->SetDataType(target.GetDataType());
var->SetShape(target.Shape());
auto& target_grad = retv[target.Name()];
target_grad.name_ = fill_one_op_out;
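
For context on the change above: AppendBackward seeds the backward pass by filling the loss gradient with ones, and after this change the fill_constant op and the seed variable take the loss's own data type instead of the hard-coded FP32. A rough NumPy analogy of what that seed looks like (illustrative only, not the C++ above):

import numpy as np

def seed_loss_grad(loss):
    # all-ones gradient with the loss's own shape and dtype, so a
    # float64 loss starts a float64 backward pass
    return np.full(loss.shape, 1.0, dtype=loss.dtype)

loss = np.zeros((1,), dtype=np.float64)
assert seed_loss_grad(loss).dtype == np.float64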

@@ -19,6 +19,8 @@ limitations under the License. */
#include "paddle/framework/operator.h"
#include "paddle/framework/program_desc.h"
#include "glog/logging.h"
namespace paddle {
namespace framework {
@@ -262,6 +264,7 @@ void OpDescBind::CheckAttrs() {
}
void OpDescBind::InferShape(const BlockDescBind &block) const {
VLOG(3) << "CompileTime infer shape on " << Type();
auto &funcs = InferShapeFuncs();
auto it = funcs.find(this->Type());
if (it == funcs.end()) {

@@ -414,7 +414,9 @@ class CompileTimeInferShapeContext : public InferShapeContext {
private:
DDim GetDim(const std::string& name) const override {
return framework::make_ddim(block_.FindVarRecursive(name)->Shape());
auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
return framework::make_ddim(var->Shape());
}
void SetDim(const std::string& name, const DDim& dim) override {
@@ -658,8 +660,9 @@ class OperatorWithKernel : public OperatorBase {
}
if (t != nullptr) {
int tmp = static_cast<int>(ToDataType(t->type()));
VLOG(3) << "Input " << ipt_name << " with data_type " << tmp;
PADDLE_ENFORCE(tmp == data_type || data_type == -1,
"DataType of Paddle Op must be same.");
"DataType of Paddle Op %s must be same.", Type());
data_type = tmp;
}
}

@@ -446,12 +446,16 @@ REGISTER_OP(thresholded_relu, ops::ActivationOp,
REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker<float>,
hard_sigmoid_grad, ops::ActivationOpGrad);
#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL( \
act_type, \
ops::ActivationKernel<paddle::platform::CPUPlace, ops::functor<float>>); \
REGISTER_OP_CPU_KERNEL(act_type##_grad, \
ops::ActivationGradKernel<paddle::platform::CPUPlace, \
ops::grad_functor<float>>);
#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL( \
act_type, \
ops::ActivationKernel<paddle::platform::CPUPlace, ops::functor<float>>, \
ops::ActivationKernel<paddle::platform::CPUPlace, \
ops::functor<double>>); \
REGISTER_OP_CPU_KERNEL( \
act_type##_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<paddle::platform::CPUPlace, \
ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);

@@ -17,12 +17,16 @@
namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_GPU_KERNEL( \
act_type, \
ops::ActivationKernel<paddle::platform::GPUPlace, ops::functor<float>>); \
REGISTER_OP_GPU_KERNEL(act_type##_grad, \
ops::ActivationGradKernel<paddle::platform::GPUPlace, \
ops::grad_functor<float>>);
#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_GPU_KERNEL( \
act_type, \
ops::ActivationKernel<paddle::platform::GPUPlace, ops::functor<float>>, \
ops::ActivationKernel<paddle::platform::GPUPlace, \
ops::functor<double>>); \
REGISTER_OP_GPU_KERNEL( \
act_type##_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<paddle::platform::GPUPlace, \
ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL);

@@ -210,8 +210,8 @@ struct HardShrinkFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
auto temp1 = (x < (threshold * -1)).template cast<T>().eval();
auto temp2 = (x > threshold).template cast<T>().eval();
auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>().eval();
auto temp2 = (x > static_cast<T>(threshold)).template cast<T>().eval();
y.device(d) = x * (temp1 + temp2);
}
};
@@ -226,8 +226,8 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
auto temp1 = (x < (threshold * -1)).template cast<T>().eval();
auto temp2 = (x > threshold).template cast<T>().eval();
auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>().eval();
auto temp2 = (x > static_cast<T>(threshold)).template cast<T>().eval();
dx.device(d) = dy * (temp1 + temp2).template cast<T>();
}
};
@@ -243,9 +243,10 @@ struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
auto temp1 = (x > lambda).template cast<T>().eval();
auto temp2 = (x < -lambda).template cast<T>().eval();
y.device(d) = temp1 * (x - lambda) + temp2 * (x + lambda);
auto lambdaT = static_cast<T>(lambda);
auto temp1 = (x > lambdaT).template cast<T>().eval();
auto temp2 = (x < -lambdaT).template cast<T>().eval();
y.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT);
}
};
@@ -257,8 +258,9 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
auto temp1 = (x > lambda).template cast<T>().eval();
auto temp2 = (x < -lambda).template cast<T>().eval();
auto lambdaT = static_cast<T>(lambda);
auto temp1 = (x > lambdaT).template cast<T>().eval();
auto temp2 = (x < -lambdaT).template cast<T>().eval();
dx.device(d) = dy * (temp1 + temp2).template cast<T>();
}
};
@@ -362,7 +364,8 @@ struct BReluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
y.device(d) = x.cwiseMax(t_min).cwiseMin(t_max);
y.device(d) =
x.cwiseMax(static_cast<T>(t_min)).cwiseMin(static_cast<T>(t_max));
}
};
@@ -375,7 +378,9 @@ struct BReluGradFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
dx.device(d) = dy * ((x > t_min) * (x < t_max)).template cast<T>();
dx.device(d) = dy *
((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
.template cast<T>();
}
};
@@ -390,7 +395,8 @@ struct Relu6Functor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
y.device(d) = x.cwiseMax(static_cast<T>(0)).cwiseMin(threshold);
y.device(d) =
x.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(threshold));
}
};
@@ -402,8 +408,9 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
dx.device(d) =
dy * ((x > static_cast<T>(0)) * (x < threshold)).template cast<T>();
dx.device(d) = dy *
((x > static_cast<T>(0)) * (x < static_cast<T>(threshold)))
.template cast<T>();
}
};
@@ -463,7 +470,8 @@ struct SoftReluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
auto temp = x.cwiseMax(-threshold).cwiseMin(threshold);
auto tmp = static_cast<T>(threshold);
auto temp = x.cwiseMax(-tmp).cwiseMin(tmp);
y.device(d) = (static_cast<T>(1) + temp.exp()).log();
}
};
@@ -476,7 +484,8 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
auto temp = ((x > -threshold) * (x < threshold)).template cast<T>().eval();
auto tmp = static_cast<T>(threshold);
auto temp = ((x > -tmp) * (x < tmp)).template cast<T>().eval();
dx.device(d) = dy * (static_cast<T>(1) - (-y).exp()) * temp;
}
};
@@ -490,7 +499,7 @@ struct LeakyReluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
y.device(d) = x.cwiseMax(alpha * x);
y.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
}
};
@@ -502,7 +511,8 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
auto temp1 = alpha * (x < static_cast<T>(0)).template cast<T>().eval();
auto temp1 = static_cast<T>(alpha) *
(x < static_cast<T>(0)).template cast<T>().eval();
auto temp2 = (x >= static_cast<T>(0)).template cast<T>().eval();
dx.device(d) = dy * (temp1 + temp2).template cast<T>();
}
@@ -517,9 +527,9 @@ struct ELUFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
y.device(d) =
x.cwiseMax(static_cast<T>(0)) +
(alpha * (x.exp() - static_cast<T>(1))).cwiseMin(static_cast<T>(0));
y.device(d) = x.cwiseMax(static_cast<T>(0)) +
(static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)))
.cwiseMin(static_cast<T>(0));
}
};
@@ -531,9 +541,9 @@ struct ELUGradFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
dx.device(d) =
dy * (x > static_cast<T>(0)).template cast<T>() +
dy * (y + alpha) * (x < static_cast<T>(0)).template cast<T>();
dx.device(d) = dy * (x > static_cast<T>(0)).template cast<T>() +
dy * (y + static_cast<T>(alpha)) *
(x < static_cast<T>(0)).template cast<T>();
}
};
@@ -545,7 +555,7 @@ struct PowFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
y.device(d) = x.pow(factor);
y.device(d) = x.pow(static_cast<T>(factor));
}
};
@@ -557,7 +567,8 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
}
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
dx.device(d) = dy * factor * x.pow(factor - static_cast<T>(1));
dx.device(d) = dy * static_cast<T>(factor) *
x.pow(static_cast<T>(factor - static_cast<T>(1)));
}
};
@@ -571,7 +582,8 @@ struct STanhFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
y.device(d) = scale_b * (scale_a * x).tanh();
y.device(d) =
static_cast<T>(scale_b) * (static_cast<T>(scale_a) * x).tanh();
}
};
@@ -585,8 +597,10 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
auto temp = (scale_a * x).tanh() * (scale_a * x).tanh();
dx.device(d) = dy * scale_a * scale_b * (static_cast<T>(1) - temp);
auto a = static_cast<T>(scale_a);
auto b = static_cast<T>(scale_b);
auto temp = (a * x).tanh() * (a * x).tanh();
dx.device(d) = dy * a * b * (static_cast<T>(1) - temp);
}
};
@@ -599,7 +613,8 @@ struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
y.device(d) = (x > static_cast<T>(threshold)).template cast<T>() * x;
auto th = static_cast<T>(threshold);
y.device(d) = (x > th).template cast<T>() * x;
}
};
@@ -612,7 +627,8 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
dx.device(d) = dy * (x > static_cast<T>(threshold)).template cast<T>();
auto th = static_cast<T>(threshold);
dx.device(d) = dy * (x > th).template cast<T>();
}
};

@@ -64,5 +64,6 @@ namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(fill_constant, ops::FillConstantOp,
ops::FillConstantOpMaker);
REGISTER_OP_CPU_KERNEL(
fill_constant,
ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>);
fill_constant, ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>,
ops::FillConstantOpKernel<paddle::platform::CPUPlace, double>,
ops::FillConstantOpKernel<paddle::platform::CPUPlace, int>);

@@ -18,5 +18,6 @@
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
fill_constant,
ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>);
fill_constant, ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>,
ops::FillConstantOpKernel<paddle::platform::GPUPlace, double>,
ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>);

@@ -25,7 +25,7 @@ class FillConstantOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto value = ctx.Attr<T>("value");
auto value = ctx.Attr<float>("value");
auto out_eigen = framework::EigenVector<T>::Flatten(*out);
auto place = ctx.GetEigenDevice<Place>();

@@ -171,8 +171,7 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(
weight_width, frame_size * 3,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
auto bias = Input("Bias");
if (bias != framework::kEmptyVarName) {
if (ctx->HasInput("Bias")) {
auto bias_dims = ctx->GetInputDim("Bias");
int bias_height = bias_dims[0];
int bias_width = bias_dims[1];
@@ -203,6 +202,8 @@ namespace ops = paddle::operators;
REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad,
ops::GRUUnitGradOp);
REGISTER_OP_CPU_KERNEL(gru_unit,
ops::GRUUnitKernel<paddle::platform::CPUPlace, float>);
ops::GRUUnitKernel<paddle::platform::CPUPlace, float>,
ops::GRUUnitKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::CPUPlace, float>);
gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::CPUPlace, float>,
ops::GRUUnitGradKernel<paddle::platform::CPUPlace, double>);

@@ -17,6 +17,8 @@
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(gru_unit,
ops::GRUUnitKernel<paddle::platform::GPUPlace, float>);
ops::GRUUnitKernel<paddle::platform::GPUPlace, float>,
ops::GRUUnitKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(
gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::GPUPlace, float>);
gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::GPUPlace, float>,
ops::GRUUnitGradKernel<paddle::platform::GPUPlace, double>);

@@ -71,7 +71,8 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators;
REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker);
REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
REGISTER_OP_CPU_KERNEL(mean,
ops::MeanKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel<paddle::platform::CPUPlace, float>,
ops::MeanKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(mean_grad,
ops::MeanGradKernel<paddle::platform::CPUPlace, float>);
ops::MeanGradKernel<paddle::platform::CPUPlace, float>,
ops::MeanGradKernel<paddle::platform::CPUPlace, double>);

@@ -17,7 +17,8 @@
#include "paddle/operators/mean_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(mean,
ops::MeanKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel<paddle::platform::GPUPlace, float>,
ops::MeanKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(mean_grad,
ops::MeanGradKernel<paddle::platform::GPUPlace, float>);
ops::MeanGradKernel<paddle::platform::GPUPlace, float>,
ops::MeanGradKernel<paddle::platform::GPUPlace, double>);

@@ -73,4 +73,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
ops::ScaleGradMaker);
REGISTER_OP_CPU_KERNEL(scale,
ops::ScaleKernel<paddle::platform::CPUPlace, float>);
ops::ScaleKernel<paddle::platform::CPUPlace, float>,
ops::ScaleKernel<paddle::platform::CPUPlace, double>);

@@ -15,4 +15,5 @@
#include "paddle/operators/scale_op.h"
REGISTER_OP_GPU_KERNEL(
scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>);
scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>,
paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>);

@@ -19,7 +19,7 @@
namespace paddle {
namespace operators {
template <typename Place, typename T, typename AttrType = T>
template <typename Place, typename T>
class ScaleKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& context) const {
@@ -27,7 +27,7 @@ class ScaleKernel : public framework::OpKernel<T> {
auto* in = context.Input<framework::Tensor>("X");
tensor->mutable_data<T>(in->place());
auto scale = static_cast<T>(context.Attr<AttrType>("scale"));
auto scale = static_cast<T>(context.Attr<float>("scale"));
auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);

@@ -23,18 +23,21 @@ using Tensor = framework::Tensor;
namespace {
template <typename T>
__global__ void CrossEntropyGrad(T* out_grad, const T* in_grad,
__global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad,
const int* labels, const int batch_size,
const int class_num) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int sample_idx = tid / class_num;
if (tid < batch_size * class_num) out_grad[tid] *= in_grad[sample_idx];
__syncthreads();
if (tid < batch_size) {
PADDLE_ASSERT(labels[sample_idx] >= 0 && labels[sample_idx] < class_num);
out_grad[tid * class_num + labels[tid]] -= 1.;
logit_grad[tid * class_num + labels[tid]] -= static_cast<T>(1.);
}
__syncthreads();
if (tid < batch_size * class_num) {
logit_grad[tid] *= loss_grad[sample_idx];
}
}
@@ -47,7 +50,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
int ids = blockIdx.x * blockDim.x + threadIdx.x;
if (ids < batch_size * class_num) {
int row_ids = ids / class_num;
logit_grad[ids] = logit_grad[ids] * loss_grad[row_ids] - labels[ids];
logit_grad[ids] = logit_grad[ids] * (loss_grad[row_ids] - labels[ids]);
}
}
} // namespace

@@ -67,8 +67,8 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
logit_grad_mat *
out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) -
lbl_mat;
(out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) -
lbl_mat);
} else {
const int batch_size = logit_grad->dims()[0];
const int* label_data = labels->data<int>();
@@ -78,7 +78,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
for (int i = 0; i < batch_size; ++i) {
int index = i * class_num + label_data[i];
logit_grad_data[index] =
(out_grad_data[i] * logit_grad_data[index] - 1.);
out_grad_data[i] * (logit_grad_data[index] - 1.);
}
}
}
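
A gradient fix like the one above is exactly what this PR's graph-based gradient check is meant to catch: the analytic gradient produced by the kernel is compared against central finite differences, and that comparison is only reliable in float64, which is why double kernels are registered throughout this diff. A minimal, self-contained sketch of such a check (not the op_test implementation itself):

import numpy as np

def numeric_grad(f, x, eps=1e-5):
    # central-difference gradient of a scalar-valued f at x, in float64
    x = x.astype(np.float64)
    grad = np.empty_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        orig = x[idx]
        x[idx] = orig + eps
        f_pos = f(x)
        x[idx] = orig - eps
        f_neg = f(x)
        x[idx] = orig
        grad[idx] = (f_pos - f_neg) / (2 * eps)
        it.iternext()
    return grad

# e.g. the mean op (which gains a double kernel in this diff): d mean(x)/dx = 1/x.size
x = np.random.rand(2, 3)
analytic = np.full_like(x, 1.0 / x.size)
assert np.allclose(numeric_grad(np.mean, x), analytic, rtol=1e-6, atol=1e-8)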

@@ -95,17 +95,18 @@ class SplitOpMaker : public framework::OpProtoAndCheckerMaker {
}
};
class SplitOpGrad : public NetOp {
class SplitGradMaker : public framework::SingleGradOpDescMaker {
public:
SplitOpGrad(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
auto out_grad = Inputs(framework::GradVarName("Out"));
auto x_grad = Output(framework::GradVarName("X"));
AppendOp(framework::OpRegistry::CreateOp("concat", {{"X", out_grad}},
{{"Out", {x_grad}}}, attrs));
CompleteAddOp(false);
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDescBind> Apply() const override {
auto op = new framework::OpDescBind();
op->SetType("concat");
op->SetInput("X", OutputGrad("Out"));
op->SetOutput("Out", InputGrad("X"));
op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(op);
}
};
@@ -114,7 +115,7 @@ class SplitOpGrad : public NetOp {
namespace ops = paddle::operators;
USE_CPU_ONLY_OP(concat);
REGISTER_OP(split, ops::SplitOp, ops::SplitOpMaker, split_grad,
ops::SplitOpGrad);
REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker);
REGISTER_OP_CPU_KERNEL(split,
ops::SplitOpKernel<paddle::platform::CPUPlace, float>);
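
The SplitGradMaker above replaces the old NetOp-based SplitOpGrad, relying on the identity that the gradient of split is the concatenation of the output gradients along the same axis. A small NumPy illustration of that identity (illustrative only):

import numpy as np

x = np.arange(12.0).reshape(3, 4)
outs = np.split(x, 2, axis=1)                # forward: split X into two pieces
out_grads = [np.ones_like(o) for o in outs]  # gradients flowing into each piece
x_grad = np.concatenate(out_grads, axis=1)   # backward: concat the output grads
assert x_grad.shape == x.shape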

@@ -84,4 +84,5 @@ class SumGradMaker : public framework::GradOpDescMakerBase {
namespace ops = paddle::operators;
REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker);
REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel<paddle::platform::CPUPlace, float>,
ops::SumKernel<paddle::platform::CPUPlace, double>);

@@ -13,4 +13,5 @@ limitations under the License. */
#include "paddle/operators/sum_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>,
ops::SumKernel<paddle::platform::GPUPlace, double>);

File diff suppressed because it is too large.

@@ -335,7 +335,7 @@ class TestSoftplus(OpTest):
    def setUp(self):
        self.op_type = "softplus"
        self.inputs = {
            'X': np.random.uniform(-1, 1, [11, 17]).astype("float32")
            'X': np.random.uniform(-1, 1, [11, 17]).astype("float64")
        }
        self.outputs = {'Y': np.log(1 + np.exp(self.inputs['X']))}

@@ -1,10 +1,25 @@
import unittest
import numpy as np
from op_test import OpTest, get_backward_op, grad_var_name
from op_test import OpTest
import paddle.v2.framework.core as core
from paddle.v2.framework.op import Operator


def grad_var_name(var_name):
    return var_name + "@GRAD"


def get_backward_op(scope, op, no_grad_set):
    backward_op = core.Operator.backward(op, no_grad_set)
    for input in backward_op.input_vars():
        var = scope.var(input)
        var.get_tensor()
    for output in backward_op.output_vars():
        var = scope.var(output)
        var.get_tensor()
    return backward_op


def _reference_training(x, scale, offset, epsilon, data_format):
    if data_format != "NHWC":
        raise ValueError("data_format must be NHWC, got %s." % data_format)

Some files were not shown because too many files have changed in this diff.
