From 5b93ac7778754943914b3f9511208e55a3526f64 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 4 Sep 2017 15:49:33 +0800 Subject: [PATCH 1/6] package a new USE_NO_KERNEL_OP for USE_OP_ITSELF --- doc/howto/dev/new_op_cn.md | 6 ++++++ paddle/framework/backward.cc | 2 +- paddle/framework/op_registry.h | 2 ++ paddle/operators/minus_op.cc | 2 +- paddle/operators/recurrent_op.cc | 2 +- paddle/pybind/pybind.cc | 4 ++-- python/paddle/v2/framework/op.py | 2 +- 7 files changed, 14 insertions(+), 6 deletions(-) diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md index ec79b7f42b..dfcbce9037 100644 --- a/doc/howto/dev/new_op_cn.md +++ b/doc/howto/dev/new_op_cn.md @@ -227,6 +227,12 @@ make mul_op USE_CPU_ONLY_OP(gather); ``` + 如果OP不带Kernel,则使用`USE_NO_KERNEL_OP`: + + ``` + USE_NO_KERNEL_OP(recurrent); + ``` + 使用`USE_OP`告知编译器需要链接该Op的目标文件,具体解释参考[代码注释](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81)。 diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index 6b4c612cd8..c5d4662215 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -182,7 +182,7 @@ static std::unique_ptr BackwardRecursive( }); // process recurrent gradient op as a special operator. - if (forwardOp.Type() == "recurrent_op") { + if (forwardOp.Type() == "recurrent") { // NOTE clean up cycle call somewhere (RNN's stepnet contains itself), or // this will result in infinite loop. const auto& rnnop = diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 64c7f23ab6..27f1927aed 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -194,6 +194,8 @@ class OpKernelRegistrar : public Registrar { USE_OP_DEVICE_KERNEL(op_type, GPU) #endif +#define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type); + #define USE_CPU_ONLY_OP(op_type) \ USE_OP_ITSELF(op_type); \ USE_OP_DEVICE_KERNEL(op_type, CPU); diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc index b4afebcd97..138cdbd563 100644 --- a/paddle/operators/minus_op.cc +++ b/paddle/operators/minus_op.cc @@ -79,7 +79,7 @@ class MinusGradOp : public NetOp { } // namespace paddle USE_OP(scale); -USE_OP_ITSELF(identity); +USE_NO_KERNEL_OP(identity); namespace ops = paddle::operators; REGISTER_OP(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradOp); REGISTER_OP_CPU_KERNEL(minus, diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index 16bd249cb3..e826703c60 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -235,5 +235,5 @@ RecurrentGradientOp::RecurrentGradientOp( } // namespace paddle REGISTER_OP_WITHOUT_GRADIENT( - recurrent_op, paddle::operators::RecurrentOp, + recurrent, paddle::operators::RecurrentOp, paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker); diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 3bc150ccb7..6896422617 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -39,12 +39,12 @@ USE_OP(sigmoid); USE_OP(softmax); USE_OP(rowwise_add); USE_OP(fill_zeros_like); -USE_OP_ITSELF(recurrent_op); +USE_NO_KERNEL_OP(recurrent); USE_OP(gaussian_random); USE_OP(uniform_random); USE_OP(lookup_table); USE_OP(scale); -USE_OP_ITSELF(identity); +USE_NO_KERNEL_OP(identity); USE_OP(minus); USE_CPU_ONLY_OP(gather); USE_CPU_ONLY_OP(scatter); diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/framework/op.py index 6ac656321e..e7e932f6fe 100644 --- a/python/paddle/v2/framework/op.py +++
b/python/paddle/v2/framework/op.py @@ -179,7 +179,7 @@ class OperatorFactory(object): class __RecurrentOp__(object): __proto__ = None - type = 'recurrent_op' + type = 'recurrent' def __init__(self): # cache recurrent_op's proto From 7c785426e4f15c7ef3a6659e1d32385ed5e48273 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Mon, 4 Sep 2017 22:04:08 +0800 Subject: [PATCH 2/6] add missing protostr for sub_nested_seq_layer. --- ..._layers.protostr => test_sub_nested_seq_select_layer.protostr} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename python/paddle/trainer_config_helpers/tests/configs/protostr/{test_seq_select_layers.protostr => test_sub_nested_seq_select_layer.protostr} (100%) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sub_nested_seq_select_layer.protostr similarity index 100% rename from python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr rename to python/paddle/trainer_config_helpers/tests/configs/protostr/test_sub_nested_seq_select_layer.protostr From 447033296d927dd0b0c1240e2ecccaa667eb0fe8 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 4 Sep 2017 22:29:34 +0800 Subject: [PATCH 3/6] Make some operator correctly handle gradients for multi inputs. --- paddle/operators/mul_op.cc | 4 +-- paddle/operators/mul_op.h | 36 ++++++++++--------- paddle/operators/rowwise_add_op.cc | 6 ++-- paddle/operators/rowwise_add_op.h | 24 +++++++------ paddle/operators/scatter_op.cc | 4 +-- paddle/operators/scatter_op.h | 10 +++--- .../v2/framework/tests/gradient_checker.py | 25 +++++++++++-- .../paddle/v2/framework/tests/test_mul_op.py | 3 +- 8 files changed, 72 insertions(+), 40 deletions(-) diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 8d0f59745f..603dc7f4bd 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -75,8 +75,8 @@ class MulOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE(y_dims[1] == out_dims[1], "Out@GRAD M X N must equal to Y dims 1, N "); - x_grad->Resize(x_dims); - y_grad->Resize(y_dims); + if (x_grad) x_grad->Resize(x_dims); + if (y_grad) y_grad->Resize(y_dims); } }; diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 8facc02814..66ed2f81c7 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -31,13 +31,13 @@ template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Y = context.Input("Y"); - auto* Z = context.Output("Out"); - Z->mutable_data(context.GetPlace()); + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* z = context.Output("Out"); + z->mutable_data(context.GetPlace()); auto* device_context = const_cast(context.device_context_); - math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + math::matmul(*x, false, *y, false, 1, z, 0, device_context); } }; @@ -45,20 +45,24 @@ template class MulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dY = ctx.Output(framework::GradVarName("Y")); - 
dX->mutable_data(ctx.GetPlace()); - dY->mutable_data(ctx.GetPlace()); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); auto* device_context = const_cast(ctx.device_context_); - // dX = dOut * Y'. dX: M x K, dOut : M x N, Y : K x N - math::matmul(*dOut, false, *Y, true, 1, dX, 0, device_context); - // dY = X' * dOut. dY: K x N, dOut : M x N, X : M x K - math::matmul(*X, true, *dOut, false, 1, dY, 0, device_context); + if (dx) { + // dx = dout * y'. dx: M x K, dout : M x N, y : K x N + dx->mutable_data(ctx.GetPlace()); + math::matmul(*dout, false, *y, true, 1, dx, 0, device_context); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + // dy = x' * dout. dy K x N, dout : M x N, x : M x K + math::matmul(*x, true, *dout, false, 1, dy, 0, device_context); + } } }; diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc index 63de91254f..a9dfba3e95 100644 --- a/paddle/operators/rowwise_add_op.cc +++ b/paddle/operators/rowwise_add_op.cc @@ -64,8 +64,10 @@ class RowwiseAddGradOp : public framework::OperatorWithKernel { auto dims0 = ctx.Input("X")->dims(); auto dims1 = ctx.Input("b")->dims(); PADDLE_ENFORCE_EQ(1, dims1.size(), "b dims should be 1") - ctx.Output(framework::GradVarName("X"))->Resize(dims0); - ctx.Output(framework::GradVarName("b"))->Resize(dims1); + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *db = ctx.Output(framework::GradVarName("b")); + if (dx) dx->Resize(dims0); + if (db) db->Resize(dims1); } }; diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h index 1cbd8bb31a..4e926d9f29 100644 --- a/paddle/operators/rowwise_add_op.h +++ b/paddle/operators/rowwise_add_op.h @@ -51,20 +51,24 @@ template class RowwiseAddGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* dOut = context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); + auto* dout = context.Input(framework::GradVarName("Out")); + auto* dx = context.Output(framework::GradVarName("X")); auto* db = context.Output(framework::GradVarName("b")); - dX->mutable_data(context.GetPlace()); - db->mutable_data(context.GetPlace()); - auto OutGrad = EigenMatrix::From(*dOut); + auto out_grad = EigenMatrix::From(*dout); auto place = context.GetEigenDevice(); - EigenMatrix::From(*dX).device(place) = OutGrad; + if (dx) { + dx->mutable_data(context.GetPlace()); + EigenMatrix::From(*dx).device(place) = out_grad; + } - // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html - // colwise add - Eigen::array dims{{0}}; /* dimension to reduce */ - EigenVector::Flatten(*db).device(place) = OutGrad.sum(dims); + if (db) { + db->mutable_data(context.GetPlace()); + // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html + // colwise add + Eigen::array dims{{0}}; /* dimension to reduce */ + EigenVector::Flatten(*db).device(place) = out_grad.sum(dims); + } } }; } // namespace operators diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc index 35c185ad80..9b5068f07c 100644 --- a/paddle/operators/scatter_op.cc +++ b/paddle/operators/scatter_op.cc @@ -50,8 +50,8 @@ class ScatterGradOp : public framework::OperatorWithKernel { auto *dRef = ctx.Output(framework::GradVarName("Ref")); auto *Ref = ctx.Input("Ref"); - dRef->Resize(Ref->dims()); - dUpdates->Resize(Updates->dims()); + if (dRef) dRef->Resize(Ref->dims()); + if (dUpdates) 
dUpdates->Resize(Updates->dims()); } }; diff --git a/paddle/operators/scatter_op.h b/paddle/operators/scatter_op.h index e9595638a8..7551480211 100644 --- a/paddle/operators/scatter_op.h +++ b/paddle/operators/scatter_op.h @@ -49,10 +49,12 @@ class ScatterGradientOpKernel : public framework::OpKernel { auto *dOut = ctx.Input(framework::GradVarName("Out")); // In place gradient: dRef = dO - dRef->ShareDataWith(*dOut); - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates += dO[Index] - Gather(ctx.GetPlace(), dOut, Index, dUpdates); + if (dRef) dRef->ShareDataWith(*dOut); + if (dUpdates) { + dUpdates->mutable_data(ctx.GetPlace()); + // Gradient by Gather: dUpdates += dO[Index] + Gather(ctx.GetPlace(), dOut, Index, dUpdates); + } } }; diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py index 518f828bac..82ab7ad39b 100644 --- a/python/paddle/v2/framework/tests/gradient_checker.py +++ b/python/paddle/v2/framework/tests/gradient_checker.py @@ -286,6 +286,9 @@ class GradientChecker(unittest.TestCase): for no_grad in no_grad_set: if no_grad not in in_names: raise ValueError("no_grad should be in in_names") + if name in inputs_to_check: + raise ValueError("no_grad should not be in inputs_to_check") + backward_op = core.Operator.backward(forward_op, no_grad_set) places = [core.CPUPlace()] @@ -301,9 +304,25 @@ class GradientChecker(unittest.TestCase): check_names = [grad_var_name(name) for name in inputs_to_check] for place in places: - # get analytical gradients according to different device - analytic_grads = self.__get_gradient(forward_op, backward_op, - input_vars, check_names, place) + # analytic_grads = self.__get_gradient(forward_op, backward_op, + # input_vars, check_names, place) + # In fact, the above two lines can be used to replace following + # codes. But most of the gradient operators need to handle the case + # where one of more of the gradient of the input is not needed. + # We change the unit test framework to explicitly test whether + # the operator correctly handles this through follow codes. + # In addtion, if all the inputs have no gradients, the NOP operator + # will be returned by core.Operator.backward(). The following codes + # do not test this case. 
+ analytic_grads = [] + for name in inputs_to_check: + no_grads = [name for name in no_grad_set] + no_grads.extend(filter(lambda x: x != name, inputs_to_check)) + backward_op = core.Operator.backward(forward_op, set(no_grads)) + # get analytical gradients according to different device + analytic_grads.extend( + self.__get_gradient(forward_op, backward_op, input_vars, + [grad_var_name(name)], place)) self.__assert_is_close(numeric_grads, analytic_grads, check_names, max_relative_error, "Gradient Check On %s" % str(place)) diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/framework/tests/test_mul_op.py index ee0d81a64e..81371b1d11 100644 --- a/python/paddle/v2/framework/tests/test_mul_op.py +++ b/python/paddle/v2/framework/tests/test_mul_op.py @@ -16,13 +16,14 @@ class TestMulOp(unittest.TestCase): self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} -class MulGradOpTest(GradientChecker): +class TestMulGradOp(GradientChecker): def test_mul(self): op = create_op("mul") inputs = { 'X': np.random.random((32, 84)).astype("float32"), 'Y': np.random.random((84, 100)).astype("float32") } + self.compare_grad(op, inputs) # mul op will enlarge the relative error self.check_grad( op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.5) From 3d9d32a1c1462780ea1a5682a27ce7da090a4b74 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 4 Sep 2017 16:20:27 -0700 Subject: [PATCH 4/6] Invoke check_grad many times for no_grad_set --- .../v2/framework/tests/gradient_checker.py | 23 +++------------- .../paddle/v2/framework/tests/test_mul_op.py | 27 +++++++++++++++---- .../v2/framework/tests/test_rowwise_add_op.py | 16 ++++++++--- 3 files changed, 37 insertions(+), 29 deletions(-) diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py index 82ab7ad39b..b8d7e4ea43 100644 --- a/python/paddle/v2/framework/tests/gradient_checker.py +++ b/python/paddle/v2/framework/tests/gradient_checker.py @@ -286,7 +286,7 @@ class GradientChecker(unittest.TestCase): for no_grad in no_grad_set: if no_grad not in in_names: raise ValueError("no_grad should be in in_names") - if name in inputs_to_check: + if no_grad in inputs_to_check: raise ValueError("no_grad should not be in inputs_to_check") backward_op = core.Operator.backward(forward_op, no_grad_set) @@ -304,25 +304,8 @@ class GradientChecker(unittest.TestCase): check_names = [grad_var_name(name) for name in inputs_to_check] for place in places: - # analytic_grads = self.__get_gradient(forward_op, backward_op, - # input_vars, check_names, place) - # In fact, the above two lines can be used to replace following - # codes. But most of the gradient operators need to handle the case - # where one of more of the gradient of the input is not needed. - # We change the unit test framework to explicitly test whether - # the operator correctly handles this through follow codes. - # In addtion, if all the inputs have no gradients, the NOP operator - # will be returned by core.Operator.backward(). The following codes - # do not test this case. 
- analytic_grads = [] - for name in inputs_to_check: - no_grads = [name for name in no_grad_set] - no_grads.extend(filter(lambda x: x != name, inputs_to_check)) - backward_op = core.Operator.backward(forward_op, set(no_grads)) - # get analytical gradients according to different device - analytic_grads.extend( - self.__get_gradient(forward_op, backward_op, input_vars, - [grad_var_name(name)], place)) + analytic_grads = self.__get_gradient(forward_op, backward_op, + input_vars, check_names, place) self.__assert_is_close(numeric_grads, analytic_grads, check_names, max_relative_error, "Gradient Check On %s" % str(place)) diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/framework/tests/test_mul_op.py index 81371b1d11..92d2b80e87 100644 --- a/python/paddle/v2/framework/tests/test_mul_op.py +++ b/python/paddle/v2/framework/tests/test_mul_op.py @@ -17,16 +17,33 @@ class TestMulOp(unittest.TestCase): class TestMulGradOp(GradientChecker): - def test_mul(self): - op = create_op("mul") - inputs = { + def setUp(self): + self.op = create_op("mul") + self.inputs = { 'X': np.random.random((32, 84)).astype("float32"), 'Y': np.random.random((84, 100)).astype("float32") } - self.compare_grad(op, inputs) + + def test_normal(self): # mul op will enlarge the relative error self.check_grad( - op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.5) + self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5) + + def test_ignore_x(self): + self.check_grad( + self.op, + self.inputs, ["Y"], + "Out", + max_relative_error=0.5, + no_grad_set={"X"}) + + def test_ignore_y(self): + self.check_grad( + self.op, + self.inputs, ["X"], + "Out", + max_relative_error=0.5, + no_grad_set={"Y"}) # TODO(dzh,qijun) : mulgrad test case need transpose feature of blas library diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py index 45d569da29..403734e71a 100644 --- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py +++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py @@ -17,13 +17,21 @@ class TestRowwiseAddOp(unittest.TestCase): class RowwiseAddGradOpTest(GradientChecker): - def test_rowwise_add(self): - op = create_op("rowwise_add") - inputs = { + def setUp(self): + self.op = create_op("rowwise_add") + self.inputs = { "X": np.random.uniform(0.1, 1, [5, 10]).astype("float32"), "b": np.random.uniform(0.1, 1, [10]).astype("float32") } - self.check_grad(op, inputs, set(["X", "b"]), "Out") + + def test_normal(self): + self.check_grad(self.op, self.inputs, ["X", "b"], "Out") + + def test_ignore_b(self): + self.check_grad(self.op, self.inputs, ["X"], "Out", no_grad_set={"b"}) + + def test_ignore_x(self): + self.check_grad(self.op, self.inputs, ["b"], "Out", no_grad_set={"X"}) if __name__ == '__main__': From ab55d7933bd7efbdddebbcee237323505d80244a Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 5 Sep 2017 10:36:46 +0800 Subject: [PATCH 5/6] revert scatter_op and other mirror changes. 
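Besides restoring the unconditional scatter_op gradient code, this patch rewrites the gradient-test walkthrough in new_op_cn.md around the test style introduced in the previous patch: a setUp that builds the op and inputs once, a test_normal case, and test_ignore_x / test_ignore_y cases that pass no_grad_set. As a rough, framework-independent illustration of what such a check does — numerically perturb only the inputs being checked and compare against analytic gradients that skip everything in no_grad_set — here is a minimal NumPy sketch; the helper names (numeric_grad, mul_loss, mul_grad) are invented for this note and none of it is GradientChecker's actual implementation:

```
# Standalone sketch of the numeric-vs-analytic comparison behind
# check_grad (plain NumPy; NOT GradientChecker's real code).
import numpy as np

def numeric_grad(f, inputs, name, delta=1e-3):
    """Central-difference gradient of the scalar f(inputs) w.r.t. inputs[name]."""
    x = inputs[name]
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        saved = x[idx]
        x[idx] = saved + delta
        plus = f(inputs)
        x[idx] = saved - delta
        minus = f(inputs)
        x[idx] = saved
        grad[idx] = (plus - minus) / (2.0 * delta)
        it.iternext()
    return grad

def mul_loss(inputs):
    # "Forward op" reduced to a scalar, in the spirit of checking Out of mul.
    return np.dot(inputs['X'], inputs['Y']).sum()

def mul_grad(inputs, inputs_to_check, no_grad_set):
    # Analytic gradients, with the same "only produce what is asked for"
    # guard that the C++ kernels above implement via `if (dx)` / `if (dy)`.
    x, y = inputs['X'], inputs['Y']
    dout = np.ones((x.shape[0], y.shape[1]))  # d(sum)/dOut
    grads = {}
    if 'X' in inputs_to_check and 'X' not in no_grad_set:
        grads['X'] = np.dot(dout, y.T)  # dX = dOut * Y'
    if 'Y' in inputs_to_check and 'Y' not in no_grad_set:
        grads['Y'] = np.dot(x.T, dout)  # dY = X' * dOut
    return grads

inputs = {'X': np.random.rand(3, 4), 'Y': np.random.rand(4, 5)}
# Mirrors test_ignore_x: only Y's gradient is requested, X sits in no_grad_set.
analytic = mul_grad(inputs, inputs_to_check=['Y'], no_grad_set={'X'})
numeric = numeric_grad(mul_loss, inputs, 'Y')
assert np.allclose(analytic['Y'], numeric, atol=1e-4)
```

In the real tests, check_grad runs this kind of comparison for each available place, and the separate compare_grad call checks CPU results against GPU results.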
--- doc/howto/dev/new_op_cn.md | 48 ++++++++++++++----- paddle/operators/mul_op.h | 2 +- paddle/operators/scatter_op.cc | 4 +- paddle/operators/scatter_op.h | 10 ++-- .../paddle/v2/framework/tests/test_mul_op.py | 3 ++ .../v2/framework/tests/test_rowwise_add_op.py | 2 +- 6 files changed, 46 insertions(+), 23 deletions(-) diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md index ec79b7f42b..5c523bf046 100644 --- a/doc/howto/dev/new_op_cn.md +++ b/doc/howto/dev/new_op_cn.md @@ -280,28 +280,50 @@ class TestMulOp(unittest.TestCase): 反向Op单测继承自`GradientChecker`,而`GradientChecker`集成自`unittest.TestCase`,所以反向单测函数需要`test_`开头。 - ``` - class MulGradOpTest(GradientChecker): - def test_mul(self): - op = create_op("mul") - inputs = { +``` +class TestMulGradOp(GradientChecker): + def setUp(self): + self.op = create_op("mul") + self.inputs = { 'X': np.random.random((32, 84)).astype("float32"), 'Y': np.random.random((84, 100)).astype("float32") } - self.compare_grad(op, inputs) + + def test_cpu_gpu_compare(self): + self.compare_grad(self.op, self.inputs) + + def test_normal(self): # mul op will enlarge the relative error self.check_grad( - op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.5) - ``` + self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5) + + def test_ignore_x(self): + self.check_grad( + self.op, + self.inputs, ["Y"], + "Out", + max_relative_error=0.5, + no_grad_set={"X"}) + + def test_ignore_y(self): + self.check_grad( + self.op, + self.inputs, ["X"], + "Out", + max_relative_error=0.5, + no_grad_set={"Y"}) +``` + +下面解释一些关键的地方: - 调用`create_op("mul")`创建反向Op对应的前向Op。 - - 定义输入`inputs`。 - 调用`compare_grad`函数对比CPU、GPU计算结果。 - - 调用`check_grad`检查梯度稳定性,这里采用数值法检测梯度正确性。 - - 第一个参数`op` : 前向op。 - - 第二个参数`inputs` : 输入词典,词典的Key和`ProtoMaker`定义保持一致。 - - 第三个参数`set(["X", "Y"])` : 指定对输入变量`X`、`Y`做梯度检测。 + - `test_normal`中调用`check_grad`检查梯度稳定性,这里采用数值法检测梯度正确性。 + - 第一个参数`self.op` : 前向Op。 + - 第二个参数`self.inputs` : 输入词典,词典的Key和`ProtoMaker`定义保持一致。 + - 第三个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。 - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out` + - `test_ignore_x`和`test_ignore_y`分支测试只需要计算一个输入梯度的情况。 ### 编译和执行 diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 66ed2f81c7..05a79e13b3 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -54,8 +54,8 @@ class MulGradKernel : public framework::OpKernel { auto* device_context = const_cast(ctx.device_context_); if (dx) { - // dx = dout * y'. dx: M x K, dout : M x N, y : K x N dx->mutable_data(ctx.GetPlace()); + // dx = dout * y'. 
dx: M x K, dout : M x N, y : K x N math::matmul(*dout, false, *y, true, 1, dx, 0, device_context); } if (dy) { diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc index 9b5068f07c..35c185ad80 100644 --- a/paddle/operators/scatter_op.cc +++ b/paddle/operators/scatter_op.cc @@ -50,8 +50,8 @@ class ScatterGradOp : public framework::OperatorWithKernel { auto *dRef = ctx.Output(framework::GradVarName("Ref")); auto *Ref = ctx.Input("Ref"); - if (dRef) dRef->Resize(Ref->dims()); - if (dUpdates) dUpdates->Resize(Updates->dims()); + dRef->Resize(Ref->dims()); + dUpdates->Resize(Updates->dims()); } }; diff --git a/paddle/operators/scatter_op.h b/paddle/operators/scatter_op.h index 7551480211..e9595638a8 100644 --- a/paddle/operators/scatter_op.h +++ b/paddle/operators/scatter_op.h @@ -49,12 +49,10 @@ class ScatterGradientOpKernel : public framework::OpKernel { auto *dOut = ctx.Input(framework::GradVarName("Out")); // In place gradient: dRef = dO - if (dRef) dRef->ShareDataWith(*dOut); - if (dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates += dO[Index] - Gather(ctx.GetPlace(), dOut, Index, dUpdates); - } + dRef->ShareDataWith(*dOut); + dUpdates->mutable_data(ctx.GetPlace()); + // Gradient by Gather: dUpdates += dO[Index] + Gather(ctx.GetPlace(), dOut, Index, dUpdates); } }; diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/framework/tests/test_mul_op.py index 92d2b80e87..b58e4266d1 100644 --- a/python/paddle/v2/framework/tests/test_mul_op.py +++ b/python/paddle/v2/framework/tests/test_mul_op.py @@ -24,6 +24,9 @@ class TestMulGradOp(GradientChecker): 'Y': np.random.random((84, 100)).astype("float32") } + def test_cpu_gpu_compare(self): + self.compare_grad(self.op, self.inputs) + def test_normal(self): # mul op will enlarge the relative error self.check_grad( diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py index 403734e71a..2ddb85e2e7 100644 --- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py +++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py @@ -16,7 +16,7 @@ class TestRowwiseAddOp(unittest.TestCase): self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])} -class RowwiseAddGradOpTest(GradientChecker): +class TestRowwiseAddGradOp(GradientChecker): def setUp(self): self.op = create_op("rowwise_add") self.inputs = { From cdae0c754ec2f218ad06589fe669ebb00fb52e07 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 5 Sep 2017 15:51:47 +0800 Subject: [PATCH 6/6] fix Conv3d, DeConv3d (bias shape) --- paddle/gserver/layers/Conv3DLayer.cpp | 23 +++++++++++++++++------ paddle/gserver/layers/DeConv3DLayer.cpp | 22 ++++++++++++++++------ 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/paddle/gserver/layers/Conv3DLayer.cpp b/paddle/gserver/layers/Conv3DLayer.cpp index 7cc9937cce..3887aa58b2 100644 --- a/paddle/gserver/layers/Conv3DLayer.cpp +++ b/paddle/gserver/layers/Conv3DLayer.cpp @@ -42,10 +42,10 @@ bool Conv3DLayer::init(const LayerMap &layerMap, if (sharedBiases_) { CHECK_EQ((size_t)numFilters_, biasParameter_->getSize()); biases_ = - std::unique_ptr(new Weight(1, numFilters_, biasParameter_)); + std::unique_ptr(new Weight(numFilters_, 1, biasParameter_)); } else { biases_ = - std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + std::unique_ptr(new Weight(getSize(), 1, biasParameter_)); } } return true; @@ -224,20 +224,31 @@ void Conv3DLayer::bpropData(int i) { } void 
Conv3DLayer::bpropBiases() { + MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(), + 1, + biases_->getWGrad()->getElementCnt(), + false, + useGpu_); MatrixPtr outGradMat = getOutputGrad(); + if (this->sharedBiases_) { - biases_->getWGrad()->collectSharedBias(*outGradMat, 1.0f); + biases->collectSharedBias(*outGradMat, 1.0f); } else { - biases_->getWGrad()->collectBias(*outGradMat, 1.0f); + biases->collectBias(*outGradMat, 1.0f); } } void Conv3DLayer::addBias() { MatrixPtr outMat = getOutputValue(); + MatrixPtr bias = Matrix::create(biases_->getW()->getData(), + 1, + biases_->getW()->getElementCnt(), + false, + useGpu_); if (this->sharedBiases_) { - outMat->addSharedBias(*(biases_->getW()), 1.0f); + outMat->addSharedBias(*(bias), 1.0f); } else { - outMat->addBias(*(biases_->getW()), 1.0f); + outMat->addBias(*(bias), 1.0f); } } diff --git a/paddle/gserver/layers/DeConv3DLayer.cpp b/paddle/gserver/layers/DeConv3DLayer.cpp index 7d5c772c89..2838980a97 100644 --- a/paddle/gserver/layers/DeConv3DLayer.cpp +++ b/paddle/gserver/layers/DeConv3DLayer.cpp @@ -42,10 +42,10 @@ bool DeConv3DLayer::init(const LayerMap &layerMap, if (sharedBiases_) { CHECK_EQ((size_t)numFilters_, biasParameter_->getSize()); biases_ = - std::unique_ptr(new Weight(1, numFilters_, biasParameter_)); + std::unique_ptr(new Weight(numFilters_, 1, biasParameter_)); } else { biases_ = - std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + std::unique_ptr(new Weight(getSize(), 1, biasParameter_)); } } return true; @@ -191,21 +191,31 @@ void DeConv3DLayer::bpropWeights(int i) {} void DeConv3DLayer::bpropData(int i) {} void DeConv3DLayer::bpropBiases() { + MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(), + 1, + biases_->getWGrad()->getElementCnt(), + false, + useGpu_); const MatrixPtr &outGradMat = getOutputGrad(); if (this->sharedBiases_) { - biases_->getWGrad()->collectSharedBias(*outGradMat, 1.0f); + biases->collectSharedBias(*outGradMat, 1.0f); } else { - biases_->getWGrad()->collectBias(*outGradMat, 1.0f); + biases->collectBias(*outGradMat, 1.0f); } } void DeConv3DLayer::addBias() { MatrixPtr outMat = getOutputValue(); + MatrixPtr bias = Matrix::create(biases_->getW()->getData(), + 1, + biases_->getW()->getElementCnt(), + false, + useGpu_); if (this->sharedBiases_) { - outMat->addSharedBias(*(biases_->getW()), 1.0f); + outMat->addSharedBias(*(bias), 1.0f); } else { - outMat->addBias(*(biases_->getW()), 1.0f); + outMat->addBias(*(bias), 1.0f); } }
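In the hunks above, the Conv3DLayer/DeConv3DLayer bias Weight is now constructed as (numFilters_, 1) (or (getSize(), 1)), while addBias/addSharedBias and collectBias/collectSharedBias still expect a row vector, so both layers wrap the existing buffer in a non-owning 1 x N matrix via Matrix::create(data, 1, elementCnt, false, useGpu_). Only the shape of the view changes; no bias data is copied. A rough NumPy analogue follows; the channel-grouped output layout used here is an assumption made purely for illustration, not something taken from the patch:

```
# NumPy analogue of the bias handling above (illustration only; the
# channel-grouped output layout is an assumption, not taken from the patch).
import numpy as np

num_filters, spatial, batch = 3, 4, 2

bias_col = np.zeros((num_filters, 1), dtype=np.float32)   # how the Weight is now stored
bias_row = bias_col.reshape(1, num_filters)               # same buffer viewed as a row,
                                                          # like Matrix::create(data, 1, cnt)
bias_row[:] = [[1.0, 2.0, 3.0]]
assert bias_col[2, 0] == 3.0                              # the view writes through, no copy

out = np.zeros((batch, num_filters * spatial), dtype=np.float32)
# addSharedBias: every output column belonging to channel c gets bias[c] added.
out += np.repeat(bias_row, spatial, axis=1)
assert np.allclose(out[0], [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3])

# collectSharedBias: the bias gradient accumulates the output gradient per channel.
dout = np.ones_like(out)
dbias = dout.reshape(batch, num_filters, spatial).sum(axis=(0, 2))
assert np.allclose(dbias, batch * spatial)
```

The Weight gradient itself keeps its (numFilters_, 1) storage; only the temporary view handed to collectBias/collectSharedBias is reshaped.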