diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index bb0146dd0a..3aeba53572 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -103,7 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
-paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
+paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False))
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
index 1a9324ec86..2900221485 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -44,6 +44,12 @@ class SoftmaxWithCrossEntropyOpMaker
         "(bool, default: false), A flag to indicate whether to interpretate "
         "the given labels as soft labels.")
         .SetDefault(false);
+    AddAttr<bool>(
+        "numeric_stable_mode",
+        "(bool, default: false), A flag to indicate whether to use a more "
+        "numerically stable algorithm. This flag is only valid when "
+        "soft_label is false and GPU is used.")
+        .SetDefault(false);
     AddAttr<int>(
         "ignore_index",
         "(int, default -100), Specifies a target value that is ignored and"
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index a07c17348e..6d48796191 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <cub/cub.cuh>
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
+#include "paddle/fluid/platform/for_range.h"
 
 namespace paddle {
 namespace operators {
@@ -117,8 +118,8 @@ using BlockReduceTempStorage = typename BlockReduce<T, BlockDim>::TempStorage;
 // Make sure that BlockDim <= feature_size
 // This kernel is used to calculate the max element of each row
 template <typename T, int BlockDim>
-__global__ void RowReductionForMax(const T* logits_data, T* max_data,
-                                   int feature_size) {
+static __global__ void RowReductionForMax(const T* logits_data, T* max_data,
+                                          int feature_size) {
   __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
 
   auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
@@ -141,9 +142,10 @@ __global__ void RowReductionForMax(const T* logits_data, T* max_data,
 }
 
 // Make sure that BlockDim <= feature_size
-template <typename T, int BlockDim>
-__global__ void RowReductionForDiffMaxSum(const T* logits_data, T* max_data,
-                                          T* softmax, int feature_size) {
+template <typename T, int BlockDim, bool CalculateLogSoftmax = false>
+static __global__ void RowReductionForDiffMaxSum(const T* logits_data,
+                                                 T* max_data, T* softmax,
+                                                 int feature_size) {
   __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
 
   auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
@@ -153,24 +155,34 @@ __global__ void RowReductionForDiffMaxSum(const T* logits_data, T* max_data,
 
   softmax[beg_idx] = logits_data[beg_idx] - block_max;
   T diff_max_sum = real_exp(softmax[beg_idx]);
-  beg_idx += BlockDim;
-  while (beg_idx < end_idx) {
-    softmax[beg_idx] = logits_data[beg_idx] - block_max;
-    diff_max_sum += real_exp(softmax[beg_idx]);
-    beg_idx += BlockDim;
+  auto idx = beg_idx + BlockDim;
+  while (idx < end_idx) {
+    softmax[idx] = logits_data[idx] - block_max;
+    diff_max_sum += real_exp(softmax[idx]);
+    idx += BlockDim;
   }
 
   diff_max_sum =
       BlockReduce<T, BlockDim>(temp_storage).Reduce(diff_max_sum, cub::Sum());
   if (threadIdx.x == 0) max_data[blockIdx.x] = real_log(diff_max_sum);
+
+  if (!CalculateLogSoftmax) return;
+  __syncthreads();
+  diff_max_sum = max_data[blockIdx.x];
+  softmax[beg_idx] -= diff_max_sum;
+  beg_idx += BlockDim;
+  while (beg_idx < end_idx) {
+    softmax[beg_idx] -= diff_max_sum;
+    beg_idx += BlockDim;
+  }
+  if (threadIdx.x == 0) max_data[blockIdx.x] = 0;
 }
 
 // Make sure that BlockDim <= feature_size
 template <typename T, int BlockDim>
-__global__ void RowReductionForSoftmaxAndCrossEntropy(const T* logits_data,
-                                                      const T* labels_data,
-                                                      T* loss_data, T* softmax,
-                                                      int feature_size) {
+static __global__ void RowReductionForSoftmaxAndCrossEntropy(
+    const T* logits_data, const T* labels_data, T* loss_data, T* softmax,
+    int feature_size) {
   __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
 
   auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
@@ -194,11 +206,134 @@ __global__ void RowReductionForSoftmaxAndCrossEntropy(const T* logits_data,
 }
 
 template <typename T>
-__global__ void SetSoftmaxToOneWhenFeatureSizeIsOne(T* out, int batch_size) {
+struct HardLabelSoftmaxWithCrossEntropyFunctor {
+ public:
+  HardLabelSoftmaxWithCrossEntropyFunctor(const T* logits,
+                                          const int64_t* labels, T* loss,
+                                          T* log_softmax, int feature_size)
+      : logits_(logits),
+        labels_(labels),
+        loss_(loss),
+        log_softmax_(log_softmax),
+        feature_size_(feature_size) {}
+
+  __device__ void operator()(int idx) const {
+    auto row_idx = idx / feature_size_;
+    auto col_idx = idx % feature_size_;
+    if (col_idx != labels_[row_idx]) {
+      log_softmax_[idx] = real_exp(log_softmax_[idx]);
+    } else {
+      auto softmax = log_softmax_[idx];
+      log_softmax_[idx] = real_exp(softmax);
+      loss_[row_idx] = -softmax;
+    }
+  }
+
+ private:
+  const T* logits_;
+  const int64_t* labels_;
+  T* loss_;
+  T* log_softmax_;
+  int feature_size_;
+};
+
+template <typename T>
+struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx {
+ public:
+  HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx(const T* logits,
+                                                       const int64_t* labels,
+                                                       T* loss, T* log_softmax,
+                                                       int feature_size,
+                                                       int ignore_idx)
+      : logits_(logits),
+        labels_(labels),
+        loss_(loss),
+        log_softmax_(log_softmax),
+        feature_size_(feature_size),
+        ignore_idx_(ignore_idx) {}
+
+  __device__ void operator()(int idx) const {
+    auto row_idx = idx / feature_size_;
+    auto col_idx = idx % feature_size_;
+    if (col_idx != labels_[row_idx] || col_idx == ignore_idx_) {
+      log_softmax_[idx] = real_exp(log_softmax_[idx]);
+    } else {
+      auto softmax = log_softmax_[idx];
+      log_softmax_[idx] = real_exp(softmax);
+      loss_[row_idx] = -softmax;
+    }
+  }
+
+ private:
+  const T* logits_;
+  const int64_t* labels_;
+  T* loss_;
+  T* log_softmax_;
+  int feature_size_;
+  int ignore_idx_;
+};
+
+template <typename T>
+static __global__ void SetSoftmaxToOneWhenFeatureSizeIsOne(T* out,
+                                                           int batch_size) {
   auto idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < batch_size) out[idx] = static_cast<T>(1);
 }
 
+template <typename T>
+static void HardLabelSoftmaxWithCrossEntropy(
+    const platform::CUDADeviceContext& ctx, const T* logits_data,
+    const int64_t* labels_data, T* loss_data, T* softmax_data, int batch_size,
+    int feature_size, int ignore_idx) {
+  constexpr int kMaxBlockDim = 512;
+  int block_dim = feature_size >= kMaxBlockDim
+                      ? kMaxBlockDim
+                      : (1 << static_cast<int>(std::log2(feature_size)));
+  auto stream = ctx.stream();
+
+#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim)    \
+  case BlockDim: {                                                           \
+    RowReductionForMax<T, BlockDim><<<batch_size, BlockDim, 0, stream>>>(    \
+        logits_data, loss_data, feature_size);                               \
+    RowReductionForDiffMaxSum<T, BlockDim,                                   \
+                              true><<<batch_size, BlockDim, 0, stream>>>(    \
+        logits_data, loss_data, softmax_data, feature_size);                 \
+    platform::ForRange<platform::CUDADeviceContext> for_range(               \
+        ctx, batch_size* feature_size);                                      \
+    if (ignore_idx >= 0 && ignore_idx < feature_size) {                      \
+      for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx<T>(     \
+          logits_data, labels_data, loss_data, softmax_data, feature_size,   \
+          ignore_idx));                                                      \
+    } else {                                                                 \
+      for_range(HardLabelSoftmaxWithCrossEntropyFunctor<T>(                  \
+          logits_data, labels_data, loss_data, softmax_data, feature_size)); \
+    }                                                                        \
+  } break
+
+  switch (block_dim) {
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(256);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(128);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(64);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(32);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(16);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(8);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2);
+    case 1:
+      SetSoftmaxToOneWhenFeatureSizeIsOne<<<(batch_size + kMaxBlockDim - 1) /
+                                                kMaxBlockDim,
+                                            kMaxBlockDim, 0, stream>>>(
+          softmax_data, batch_size);
+      cudaMemsetAsync(loss_data, 0, batch_size * sizeof(T), stream);
+      break;
+    default:
+      PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op");
+      break;
+  }
+#undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
+}
+
 template <typename T>
 static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data,
                                                const T* labels_data,
@@ -237,7 +372,7 @@ static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data,
                                                kMaxBlockDim,
                                             kMaxBlockDim, 0, stream>>>(
           softmax_data, batch_size);
-      cudaMemsetAsync(loss_data, 0, batch_size, stream);
+      cudaMemsetAsync(loss_data, 0, batch_size * sizeof(T), stream);
       break;
     default:
       PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op");
@@ -272,11 +407,21 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
           logits_data, labels_data, softmax_data, loss_data, batch_size,
           feature_size, context.cuda_device_context().stream());
     } else {
-      math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(), logits,
-                                     softmax);
-      math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
-          context.cuda_device_context(), loss, softmax, labels, false,
-          ignore_index);
+      if (!context.Attr<bool>("numeric_stable_mode")) {
+        math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(), logits,
+                                       softmax);
+        math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
+            context.cuda_device_context(), loss, softmax, labels, false,
+            ignore_index);
+      } else {
+        int batch_size = logits->dims()[0];
+        int feature_size = logits->dims()[1];
+        auto* logits_data = logits->data<T>();
+        auto* labels_data = labels->data<int64_t>();
+        HardLabelSoftmaxWithCrossEntropy<T>(
+            context.cuda_device_context(), logits_data, labels_data, loss_data,
+            softmax_data, batch_size, feature_size, ignore_index);
+      }
     }
   }
 };
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index cdfa26dfe9..b555546cb7 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -4713,7 +4713,8 @@ def multiplex(inputs, index):
 def softmax_with_cross_entropy(logits,
                                label,
                                soft_label=False,
-                               ignore_index=-100):
+                               ignore_index=-100,
+                               numeric_stable_mode=False):
     """
     **Softmax With Cross Entropy Operator.**
 
@@ -4747,6 +4748,18 @@ def softmax_with_cross_entropy(logits,
                                 \\left(\\text{logit}_i - \\log\\left(\\sum_{i=0}^{K}
                                 \\exp(\\text{logit}_i)\\right)\\right), j = 1,...,K
 
+    3) If numeric_stable_mode is True, softmax is calculated first by:
+
+    .. math::
+
+        max_j = \\max_{i=0}^{K}{\\text{logit}_i}
+
+        log\\_max\\_sum_j = \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j)
+
+        softmax_j = \\exp(logit_j - max_j - {log\\_max\\_sum}_j)
+
+    and then the cross entropy loss is calculated from the softmax output and the label.
+
     Args:
         logits (Variable): The unscaled log probabilities, which is a 2-D tensor
             with shape [N x K]. N is the batch_size, and K is the class number.
@@ -4758,6 +4771,13 @@ def softmax_with_cross_entropy(logits,
         ignore_index (int): Specifies a target value that is ignored and does
                             not contribute to the input gradient. Only valid
                             if soft_label is set to False. Default: -100
+        numeric_stable_mode (bool): A flag to indicate whether to use a more
+                                    numerically stable algorithm. Only valid
+                                    when soft_label is False and GPU is used.
+                                    When soft_label is True or CPU is used,
+                                    the algorithm is always numerically stable.
+                                    Note that the speed may be slower when the
+                                    stable algorithm is used. Default: False
 
     Returns:
         Variable: The cross entropy loss is a 2-D tensor with shape [N x 1].
@@ -4780,8 +4800,11 @@ def softmax_with_cross_entropy(logits,
                 'Label': label},
         outputs={'Softmax': softmax,
                  'Loss': loss},
-        attrs={'soft_label': soft_label,
-               'ignore_index': ignore_index})
+        attrs={
+            'soft_label': soft_label,
+            'ignore_index': ignore_index,
+            'numeric_stable_mode': numeric_stable_mode
+        })
     return loss
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
index a18941dd31..37ee880970 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
@@ -26,7 +26,11 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
     Test softmax with cross entropy operator with discreate one-hot labels.
     """
 
+    def initParams(self):
+        self.numeric_stable_mode = False
+
     def setUp(self):
+        self.initParams()
         self.op_type = "softmax_with_cross_entropy"
         batch_size = 41
         class_num = 37
@@ -46,6 +50,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
             "Softmax": softmax.astype("float64"),
             "Loss": cross_entropy.astype("float64")
         }
+        self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
 
     def test_check_output(self):
         self.check_output()
@@ -54,6 +59,11 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
         self.check_grad(["Logits"], "Loss")
 
 
+class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
+    def initParams(self):
+        self.numeric_stable_mode = True
+
+
 class TestSoftmaxWithCrossEntropyOp2(OpTest):
     """
     Test softmax with cross entropy operator with soft labels.
@@ -93,7 +103,11 @@ class TestSoftmaxWithCrossEntropyOp3(OpTest):
     Test softmax with cross entropy operator with ignore_index.
     """
 
+    def initParams(self):
+        self.numeric_stable_mode = False
+
     def setUp(self):
+        self.initParams()
         self.op_type = "softmax_with_cross_entropy"
         batch_size = 41
         class_num = 37
@@ -114,7 +128,10 @@ class TestSoftmaxWithCrossEntropyOp3(OpTest):
             "Softmax": softmax.astype("float64"),
             "Loss": cross_entropy.astype("float64")
         }
-        self.attrs = {"ignore_index": ignore_index}
+        self.attrs = {
+            "ignore_index": ignore_index,
+            "numeric_stable_mode": self.numeric_stable_mode
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -123,5 +140,10 @@ class TestSoftmaxWithCrossEntropyOp3(OpTest):
         self.check_grad(["Logits"], "Loss")
 
 
+class TestSoftmaxWithCrossEntropyOp3NoCudnn(TestSoftmaxWithCrossEntropyOp3):
+    def initParams(self):
+        self.numeric_stable_mode = True
+
+
 if __name__ == "__main__":
     unittest.main()
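
Usage note (not part of the patch): the sketch below illustrates how the new numeric_stable_mode flag added in python/paddle/fluid/layers/nn.py is expected to be used, together with a NumPy reference of the max-subtraction / log-sum-exp computation described in the updated docstring. It assumes a Paddle Fluid build that already contains this patch; the variable names, shapes, and the helper function stable_softmax_with_cross_entropy are illustrative only, not part of the library.

# Illustrative sketch, not part of the patch above.
# Assumes a Paddle Fluid build that includes the numeric_stable_mode attribute.
import numpy as np
import paddle.fluid as fluid

# Request the numerically stable hard-label GPU kernel via the new flag.
logits = fluid.layers.data(name='logits', shape=[37], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
loss = fluid.layers.softmax_with_cross_entropy(
    logits=logits, label=label, numeric_stable_mode=True)


def stable_softmax_with_cross_entropy(logits_np, labels_np):
    """Hypothetical NumPy reference of the docstring math:
       max_j = max_i(logit_i)
       log_max_sum_j = log(sum_i exp(logit_i - max_j))
       softmax_j = exp(logit_j - max_j - log_max_sum_j)
    """
    row_max = logits_np.max(axis=1, keepdims=True)                 # max_j
    shifted = logits_np - row_max                                  # logit_i - max_j
    log_sum = np.log(np.exp(shifted).sum(axis=1, keepdims=True))   # log_max_sum_j
    log_softmax = shifted - log_sum
    softmax = np.exp(log_softmax)
    # Hard-label cross entropy: -log softmax at each row's label index.
    loss = -log_softmax[np.arange(logits_np.shape[0]), labels_np.ravel()]
    return softmax, loss.reshape(-1, 1)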