Revert "PaddingRNN model memory optimize"

test=develop
6 years ago · a91964c8fe
parent 1c6caf8466
commit a91964c8fe
12 changed files with 53 additions and 419 deletions
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
--- a/paddle/fluid/operators/cross_entropy_op.cu
+++ b/paddle/fluid/operators/cross_entropy_op.cu
@ -27,13 +27,3 @@ REGISTER_OP_CUDA_KERNEL(
    cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
    ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
    ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
 // Register the CUDA kernels of the cross_entropy2 op, instantiating
 // CrossEntropyOpKernel2 for float, double and float16 element types.
 REGISTER_OP_CUDA_KERNEL(cross_entropy2,
                        ops::CrossEntropyOpKernel2<CUDACtx, float>,
                        ops::CrossEntropyOpKernel2<CUDACtx, double>,
                        ops::CrossEntropyOpKernel2<CUDACtx, plat::float16>);
 // Register the CUDA kernels of the cross_entropy_grad2 op, instantiating
 // CrossEntropyGradientOpKernel2 for the same three element types.
 REGISTER_OP_CUDA_KERNEL(
    cross_entropy_grad2, ops::CrossEntropyGradientOpKernel2<CUDACtx, float>,
    ops::CrossEntropyGradientOpKernel2<CUDACtx, double>,
    ops::CrossEntropyGradientOpKernel2<CUDACtx, plat::float16>);
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/for_range.h"
@ -138,85 +137,5 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
  }
 };
 // Per-element functor that fills the dx buffer for a cross entropy computed
 // against integer labels.
 //
 // operator() is called with a flat index into dx. The index is decomposed
 // into a row (idx / feature_size) and a column (idx % feature_size). The
 // element becomes -dy[row] * real_exp(y[row]) when the column equals the
 // label stored for that row and the label differs from ignore_index; every
 // other element is written as zero.
 // NOTE(review): presumably y holds -log(prob), which makes real_exp(y) the
 // reciprocal of the probability -- confirm against the forward kernel.
 template <typename T>
 struct HardLabelCrossEntropyBackwardFunctor {
  HardLabelCrossEntropyBackwardFunctor(T* dx, const T* y, const T* dy,
                                       const int64_t* label,
                                       int64_t ignore_index,
                                       int64_t feature_size)
      : dx_(dx),
        y_(y),
        dy_(dy),
        label_(label),
        ignore_index_(ignore_index),
        feature_size_(feature_size) {}
  HOSTDEVICE void operator()(int64_t idx) const {
    const auto row = idx / feature_size_;
    const auto col = idx % feature_size_;
    const auto target = label_[row];
    // Only the column addressed by a non-ignored label receives a gradient.
    const bool carries_grad = (target == col) && (target != ignore_index_);
    if (!carries_grad) {
      dx_[idx] = 0;
      return;
    }
    dx_[idx] = -dy_[row] * real_exp(y_[row]);
  }
  T* dx_;                  // output buffer, indexed by the flat index
  const T* y_;             // indexed per row
  const T* dy_;            // indexed per row
  const int64_t* label_;   // one label per row
  int64_t ignore_index_;   // label value whose row is zeroed entirely
  int64_t feature_size_;   // number of columns per row
 };
 // Forward kernel of the cross_entropy2 op.
 //
 // Reads inputs "X" and "Label", reshapes both to matrices with
 // ReshapeToMatrix(tensor, rank - 1) where rank is the rank of "X", allocates
 // output "Y" on the execution place and delegates the computation to
 // math::CrossEntropyFunctor.
 template <typename DeviceContext, typename T>
 class CrossEntropyOpKernel2 : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto* input = ctx.Input<Tensor>("X");
    const int num_col_dims = input->dims().size() - 1;
    auto x_matrix = framework::ReshapeToMatrix(*input, num_col_dims);
    auto label_matrix = framework::ReshapeToMatrix(
        *ctx.Input<Tensor>("Label"), num_col_dims);

    auto* out = ctx.Output<Tensor>("Y");
    out->mutable_data<T>(ctx.GetPlace());

    const auto ignore_index = ctx.Attr<int>("ignore_index");
    const auto& dev_ctx = ctx.template device_context<DeviceContext>();
    // NOTE(review): the literal `false` is presumably the soft-label flag of
    // CrossEntropyFunctor -- confirm against its declaration.
    math::CrossEntropyFunctor<DeviceContext, T> compute_loss;
    compute_loss(dev_ctx, out, &x_matrix, &label_matrix, false, ignore_index);
  }
 };
 // Backward kernel of the cross_entropy2 op.
 //
 // Allocates the gradient of "X" and fills it element by element with
 // HardLabelCrossEntropyBackwardFunctor, driven by platform::ForRange over
 // batch_size * feature_size indices. feature_size is the last dimension of
 // the gradient tensor and batch_size is the product of all its dimensions
 // divided by feature_size.
 template <typename DeviceContext, typename T>
 class CrossEntropyGradientOpKernel2 : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
    const auto* out = ctx.Input<Tensor>("Y");
    const auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Y"));
    const auto* labels = ctx.Input<Tensor>("Label");

    auto* x_grad_data = x_grad->mutable_data<T>(ctx.GetPlace());
    auto* out_data = out->data<T>();
    auto* out_grad_data = out_grad->data<T>();
    auto* label_data = labels->data<int64_t>();

    // The attribute is stored as int; it is widened for the functor.
    const int64_t ignore_index = ctx.Attr<int>("ignore_index");
    const int last_axis = x_grad->dims().size() - 1;
    const int64_t feature_size = x_grad->dims()[last_axis];
    const int64_t batch_size =
        framework::product(x_grad->dims()) / feature_size;

    HardLabelCrossEntropyBackwardFunctor<T> backward(
        x_grad_data, out_data, out_grad_data, label_data, ignore_index,
        feature_size);
    platform::ForRange<DeviceContext> for_range(
        ctx.template device_context<DeviceContext>(),
        batch_size * feature_size);
    for_range(backward);
  }
 };
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/expand_op.h"
 #include <memory>
 #include <vector>
 namespace paddle {
@ -139,28 +138,12 @@ class ExpandGradOp : public framework::OperatorWithKernel {
  }
 };
 // Builds the OpDesc of the "expand_grad" op.
 //
 // The resulting description takes the forward input "X" and the gradient of
 // the forward output "Out" as inputs, produces the gradient of "X" as its
 // output, and carries the attribute map returned by Attrs().
 class ExpandGradOpDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("expand_grad");

    // Inputs of the gradient op.
    grad_desc->SetInput("X", Input("X"));
    grad_desc->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));

    // Output of the gradient op.
    grad_desc->SetOutput(framework::GradVarName("X"), InputGrad("X"));

    grad_desc->SetAttrMap(Attrs());
    return grad_desc;
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
-                  ops::ExpandGradOpDescMaker);
+                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp);
 REGISTER_OP_CPU_KERNEL(
    expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
--- a/paddle/fluid/operators/math.h
+++ b/paddle/fluid/operators/math.h
@ -1,42 +0,0 @@
 // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/hostdevice.h"
 #include "math.h"  // NOLINT
 namespace paddle {
 namespace operators {
 // Type-dispatched exponential: float uses ::expf, double uses ::exp, and
 // float16 is evaluated through the float overload and converted back.
 inline HOSTDEVICE float real_exp(float x) { return ::expf(x); }
 inline HOSTDEVICE double real_exp(double x) { return ::exp(x); }
 inline HOSTDEVICE platform::float16 real_exp(platform::float16 x) {
  const float widened = static_cast<float>(x);
  return static_cast<platform::float16>(real_exp(widened));
 }
 // Type-dispatched natural logarithm: float uses ::logf, double uses ::log,
 // and float16 is evaluated through the float overload and converted back.
 inline HOSTDEVICE float real_log(float x) { return ::logf(x); }
 inline HOSTDEVICE double real_log(double x) { return ::log(x); }
 inline HOSTDEVICE platform::float16 real_log(platform::float16 x) {
  const float widened = static_cast<float>(x);
  return static_cast<platform::float16>(real_log(widened));
 }
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
@ -21,6 +20,17 @@ namespace paddle {
 namespace operators {
 namespace math {
 namespace {
 // Device-side natural logarithm overloads, selected by argument type.
 __device__ __forceinline__ float real_log(float x) {
  return logf(x);
 }
 __device__ __forceinline__ double real_log(double x) {
  return log(x);
 }
 // float16 is widened to float, evaluated with the float overload, and
 // converted back.
 __device__ __forceinline__ platform::float16 real_log(
    const platform::float16& val) {
  const float widened = static_cast<float>(val);
  return static_cast<platform::float16>(real_log(widened));
 }
 template <typename T>
 __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
                                   const int N, const int D,
@ -51,6 +61,7 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
    Y[blockIdx.x] = -val;
  }
 }
 }  // namespace
 template <typename T>
 class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
--- a/paddle/fluid/operators/selu_op.h
+++ b/paddle/fluid/operators/selu_op.h
@ -15,12 +15,13 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/platform/for_range.h"
 namespace paddle {
 namespace operators {
 // Exponential overloads selected by argument type: float uses expf and
 // double uses exp. The double overload returns double so that a
 // double-precision caller does not have its result rounded to float on the
 // way out.
 static HOSTDEVICE float real_exp(float x) { return expf(x); }
 static HOSTDEVICE double real_exp(double x) { return exp(x); }
 template <typename T>
 struct SeluFunctor {
  SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr)
--- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
@ -14,7 +14,6 @@ limitations under the License. */
 #include <algorithm>
 #include <cub/cub.cuh>  // NOLINT
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h"
 namespace paddle {
@ -22,6 +21,9 @@ namespace operators {
 using LoDTensor = framework::LoDTensor;
 // Device-side exponential overloads, selected by argument type: the float
 // overload calls expf and the double overload calls exp.
 __device__ __forceinline__ float real_exp(float x) {
  return expf(x);
 }
 __device__ __forceinline__ double real_exp(double x) {
  return exp(x);
 }
 template <typename T, int BlockDim>
 using BlockReduce = cub::BlockReduce<T, BlockDim>;
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "cub/cub.cuh"
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/hostdevice.h"
@ -22,6 +21,11 @@ namespace operators {
 using Tensor = framework::Tensor;
 // Exponential and natural-logarithm overloads selected by argument type.
 // The float overloads call expf/logf and the double overloads call exp/log.
 // The double overloads return double so that a double-precision caller does
 // not have each result rounded to float on the way out.
 static HOSTDEVICE float real_exp(float x) { return expf(x); }
 static HOSTDEVICE double real_exp(double x) { return exp(x); }
 static HOSTDEVICE float real_log(float x) { return logf(x); }
 static HOSTDEVICE double real_log(double x) { return log(x); }
 static constexpr int kNumCUDAThreads = 512;
 static constexpr int kNumMaxinumNumBlocks = 4096;
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@ -1432,8 +1432,6 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
          predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
          cost = fluid.layers.cross_entropy(input=predict, label=label)
    """
    if not soft_label:
        return cross_entropy2(input, label, ignore_index)
    helper = LayerHelper('cross_entropy', **locals())
    out = helper.create_variable_for_type_inference(dtype=input.dtype)
    helper.append_op(
@ -1446,20 +1444,6 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
    return out
 def cross_entropy2(input, label, ignore_index=kIgnoreIndex):
    """
    Append a ``cross_entropy2`` op for ``input`` and ``label``.

    Args:
        input: Variable passed to the op as input ``X``.
        label: Variable passed to the op as input ``Label``.
        ignore_index: Value passed to the op as attribute ``ignore_index``.
            Defaults to ``kIgnoreIndex``.

    Returns:
        The variable bound to the op's ``Y`` output. It is created with the
        same dtype as ``input``.
    """
    # locals() is evaluated as the first statement, so at this point it holds
    # exactly the three arguments above; adding locals before this line would
    # change what LayerHelper receives.
    helper = LayerHelper('cross_entropy2', **locals())
    out = helper.create_variable_for_type_inference(dtype=input.dtype)
    # Second op output; it is created here but not returned to the caller.
    xshape = helper.create_variable_for_type_inference(dtype=input.dtype)
    helper.append_op(
        type='cross_entropy2',
        inputs={'X': [input],
                'Label': [label]},
        outputs={'Y': [out],
                 'XShape': [xshape]},
        attrs={'ignore_index': ignore_index})
    return out
 def bpr_loss(input, label, name=None):
    """
    Bayesian Personalized Ranking Loss Operator.
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py
@ -1,79 +0,0 @@
 # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from op_test import OpTest
 import unittest
 import numpy as np
 import six
 class CrossEntropy2OpTestBase(OpTest):
    """Output and gradient check for the ``cross_entropy2`` op.

    Subclasses override ``initParameters`` to vary the input shape, the dtype
    and the ``ignore_index`` attribute.
    """

    def initParameters(self):
        """Return ``(shape, dtype, ignore_index)`` for this test case."""
        return [32, 64], 'float32', -100

    def calc_output(self, logits, label, ignore_index):
        """Compute the reference output with NumPy.

        Args:
            logits: 2-D array of shape ``[batch_size, feature_size]``.
            label: integer array of shape ``[batch_size, 1]``.
            ignore_index: rows whose label equals this value stay zero.

        Returns:
            Array with the shape of ``label`` and the dtype of ``logits``
            holding ``-log(logits[i][label[i]])`` for every non-ignored row.
        """
        ret = np.zeros(shape=label.shape, dtype=logits.dtype)
        for idx in six.moves.range(label.shape[0]):
            if label[idx] == ignore_index:
                continue
            ret[idx] = -np.log(logits[idx][label[idx]])
        return ret

    def setUp(self):
        """Build random inputs and the matching expected outputs."""
        self.shape, self.dtype, self.ignore_index = self.initParameters()
        self.op_type = 'cross_entropy2'
        feature_size = int(self.shape[-1])
        batch_size = int(np.prod(self.shape) / feature_size)
        # Values are drawn from [1, 2), so their logarithm is finite.
        logits = (np.random.random(size=self.shape) + 1).astype(self.dtype)
        # np.random.random_integers is deprecated and missing from current
        # NumPy releases. randint has an exclusive upper bound, so
        # high=feature_size draws from the same range [0, feature_size - 1].
        label = np.random.randint(
            low=0, high=feature_size,
            size=self.shape[0:-1] + [1]).astype('int64')
        outputs = self.calc_output(
            np.reshape(logits, [batch_size, feature_size]),
            np.reshape(label, [batch_size, 1]), self.ignore_index)
        self.inputs = {'X': logits, 'Label': label}
        self.outputs = {
            'Y': np.reshape(outputs, label.shape),
            'XShape': np.zeros(
                shape=logits.shape, dtype=logits.dtype)
        }
        self.attrs = {'ignore_index': self.ignore_index}

    # XShape is excluded from the output comparison.
    def test_check_output(self):
        self.check_output(no_check_set=['XShape'])

    # Only the gradient of X with respect to Y is checked.
    def test_check_grad(self):
        self.check_grad(
            inputs_to_check=['X'],
            output_names=['Y'],
            no_grad_set=['XShape', 'Label'])
 class CrossEntropy2OpTest2(CrossEntropy2OpTestBase):
    """Shape [32, 64], float64, ignore_index 3."""

    def initParameters(self):
        shape, dtype, ignore_index = [32, 64], 'float64', 3
        return shape, dtype, ignore_index


 class CrossEntropy2OpTest3(CrossEntropy2OpTestBase):
    """Shape [4, 8, 16, 32], float32, ignore_index -100."""

    def initParameters(self):
        shape, dtype, ignore_index = [4, 8, 16, 32], 'float32', -100
        return shape, dtype, ignore_index


 class CrossEntropy2OpTest4(CrossEntropy2OpTestBase):
    """Shape [4, 8, 16, 32], float32, ignore_index 3."""

    def initParameters(self):
        shape, dtype, ignore_index = [4, 8, 16, 32], 'float32', 3
        return shape, dtype, ignore_index
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@ -524,8 +524,8 @@ class TestLocalLookupTable(TestDistLookupTableBase):
        ops = [
            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
-            'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
+            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
-            'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
+            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
            'split_selected_rows', 'send', 'sequence_pool_grad',
            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
@ -564,8 +564,8 @@ class TestDistLookupTable(TestDistLookupTableBase):
        ops = [
            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
            'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
-            'elementwise_add', 'cross_entropy2', 'mean', 'fill_constant',
+            'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
-            'mean_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send',
+            'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
            'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
            'lookup_table_grad', 'split_selected_rows', 'send',
            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
@ -612,8 +612,8 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
        ops = [
            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
-            'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
+            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
-            'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
+            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
            'split_selected_rows', 'send', 'sequence_pool_grad',
            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
@ -652,8 +652,8 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
        ops = [
            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
            'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
-            'elementwise_add', 'cross_entropy2', 'mean', 'fill_constant',
+            'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
-            'mean_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send',
+            'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
            'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
            'lookup_table_grad', 'split_selected_rows', 'send',
            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
@ -841,8 +841,8 @@ class TestRemoteLookupTable(TestDistLookupTableBase):
        ops = [
            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
-            'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
+            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
-            'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
+            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
            'split_selected_rows', 'send', 'sequence_pool_grad',
            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',