update histogram op for performance optimization, test=develop (#24912)

Branch: my_2.0rc
Author: Qi Li, committed by GitHub
Parent: 4d5ddbf106
Commit: f373269df0

@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
-
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/histogram_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 
@@ -32,28 +30,38 @@ inline int GET_BLOCKS(const int N) {
 }
 
 template <typename T, typename IndexType>
-__device__ static IndexType GetBin(T bVal, T minvalue, T maxvalue,
+__device__ static IndexType GetBin(T input_value, T min_value, T max_value,
                                    int64_t nbins) {
-  IndexType bin =
-      static_cast<int>((bVal - minvalue) * nbins / (maxvalue - minvalue));
-  if (bin == nbins) bin -= 1;
-  return bin;
+  IndexType bin = static_cast<int>((input_value - min_value) * nbins /
+                                   (max_value - min_value));
+  IndexType output_index = bin < nbins - 1 ? bin : nbins - 1;
+  return output_index;
 }
 
 template <typename T, typename IndexType>
-__global__ void KernelHistogram(const T* input, const int totalElements,
-                                const int64_t nbins, const T minvalue,
-                                const T maxvalue, int64_t* output) {
-  CUDA_KERNEL_LOOP(linearIndex, totalElements) {
-    const IndexType inputIdx = threadIdx.x + blockIdx.x * blockDim.x;
-    const auto inputVal = input[inputIdx];
-    if (inputVal >= minvalue && inputVal <= maxvalue) {
-      const IndexType bin =
-          GetBin<T, IndexType>(inputVal, minvalue, maxvalue, nbins);
-      const IndexType outputIdx = bin < nbins - 1 ? bin : nbins - 1;
-      paddle::platform::CudaAtomicAdd(&output[outputIdx], 1);
+__global__ void KernelHistogram(const T* input, const int total_elements,
+                                const int64_t nbins, const T min_value,
+                                const T max_value, int64_t* output) {
+  extern __shared__ int64_t buf_hist[];
+  for (int i = threadIdx.x; i < nbins; i += blockDim.x) {
+    buf_hist[i] = 0;
+  }
+  __syncthreads();
+
+  CUDA_KERNEL_LOOP(input_index, total_elements) {
+    // const IndexType input_index = threadIdx.x + blockIdx.x * blockDim.x;
+    const auto input_value = input[input_index];
+    if (input_value >= min_value && input_value <= max_value) {
+      const IndexType output_index =
+          GetBin<T, IndexType>(input_value, min_value, max_value, nbins);
+      paddle::platform::CudaAtomicAdd(&buf_hist[output_index], 1);
     }
   }
+  __syncthreads();
+
+  for (int i = threadIdx.x; i < nbins; i += blockDim.x) {
+    paddle::platform::CudaAtomicAdd(&output[i], buf_hist[i]);
+  }
 }
 
 template <typename DeviceContext, typename T>
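The optimization is shared-memory privatization: each block zero-initializes a private histogram buf_hist in shared memory, accumulates into it with intra-block atomics, and only flushes its nbins partial counts to the global output once at the end, so most atomic adds no longer contend on global memory. The binning rule itself is unchanged; the following is a minimal NumPy sketch of its semantics written for this note, not code from the patch:

import numpy as np

def get_bin(value, min_value, max_value, nbins):
    # Mirrors GetBin: linear bin index, clamped so that
    # value == max_value lands in the last (inclusive) bin.
    bin_idx = int((value - min_value) * nbins / (max_value - min_value))
    return min(bin_idx, nbins - 1)

def histogram_ref(values, nbins, min_value, max_value):
    out = np.zeros(nbins, dtype=np.int64)
    for v in values:
        if min_value <= v <= max_value:  # out-of-range values are skipped
            out[get_bin(v, min_value, max_value, nbins)] += 1
    return out

data = np.array([1.0, 2.0, 1.0, 3.0])
assert (histogram_ref(data, 4, 0.0, 3.0) ==
        np.histogram(data, bins=4, range=(0.0, 3.0))[0]).all()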
@@ -125,8 +133,9 @@ class HistogramCUDAKernel : public framework::OpKernel<T> {
 
     auto stream =
         context.template device_context<platform::CUDADeviceContext>().stream();
-    KernelHistogram<T, IndexType><<<GET_BLOCKS(input_numel),
-                                    PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+    KernelHistogram<
+        T, IndexType><<<GET_BLOCKS(input_numel), PADDLE_CUDA_NUM_THREADS,
+                        nbins * sizeof(int64_t), stream>>>(
         input_data, input_numel, nbins, output_min, output_max, out_data);
   }
 };
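The third launch-configuration argument changes from 0 to nbins * sizeof(int64_t): that is the number of dynamic shared-memory bytes backing extern __shared__ int64_t buf_hist[] in the kernel. A quick capacity check, assuming the common 48 KB per-block default for dynamic shared memory (actual limits are device-specific):

# Each block needs one int64 counter per bin.
SHARED_MEM_BYTES = 48 * 1024
print(SHARED_MEM_BYTES // 8)  # 6144: rough upper bound on nbins per block

A launch requesting more shared memory than the per-block limit fails, so very fine-grained histograms are an inherent limitation of this scheme.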

@@ -58,12 +58,66 @@ class TestHistogramOpAPI(unittest.TestCase):
             msg='histogram output is wrong, out =' + str(actual.numpy()))
 
 
+class TestHistogramOpError(unittest.TestCase):
+    """Test histogram op error."""
+
+    def run_network(self, net_func):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            net_func()
+        exe = fluid.Executor()
+        exe.run(main_program)
+
+    def test_bins_error(self):
+        """Test bins should be greater than or equal to 1."""
+
+        def net_func():
+            input_value = paddle.fill_constant(
+                shape=[3, 4], dtype='float32', value=3.0)
+            paddle.histogram(input=input_value, bins=-1, min=1, max=5)
+
+        with self.assertRaises(fluid.core.EnforceNotMet):
+            self.run_network(net_func)
+
+    def test_min_max_error(self):
+        """Test max must be larger or equal to min."""
+
+        def net_func():
+            input_value = paddle.fill_constant(
+                shape=[3, 4], dtype='float32', value=3.0)
+            paddle.histogram(input=input_value, bins=1, min=5, max=1)
+
+        with self.assertRaises(fluid.core.EnforceNotMet):
+            self.run_network(net_func)
+
+    def test_min_max_range_error(self):
+        """Test range of min, max is not finite"""
+
+        def net_func():
+            input_value = paddle.fill_constant(
+                shape=[3, 4], dtype='float32', value=3.0)
+            paddle.histogram(input=input_value, bins=1, min=-np.inf, max=5)
+
+        with self.assertRaises(fluid.core.EnforceNotMet):
+            self.run_network(net_func)
+
+    def test_type_errors(self):
+        with program_guard(Program()):
+            # The input type must be Variable.
+            self.assertRaises(
+                TypeError, paddle.histogram, 1, bins=5, min=1, max=5)
+            # The input type must be 'int32', 'int64', 'float32', 'float64'
+            x_bool = fluid.data(name='x_bool', shape=[4, 3], dtype='bool')
+            self.assertRaises(
+                TypeError, paddle.histogram, x_bool, bins=5, min=1, max=5)
+
+
 class TestHistogramOp(OpTest):
     def setUp(self):
         self.op_type = "histogram"
         self.init_test_case()
-        np_input = np.random.randint(
-            low=0, high=20, size=self.in_shape, dtype=np.int64)
+        np_input = np.random.uniform(low=0.0, high=20.0, size=self.in_shape)
         self.inputs = {"X": np_input}
         self.init_attrs()
         Out, _ = np.histogram(
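The fixture switches from np.random.randint to np.random.uniform, so the default case now exercises the kernel's floating-point path; the expected output still comes from np.histogram, whose call is truncated by this hunk. A hedged sketch of what the full setUp plausibly computes, where in_shape, bins, hist_min, and hist_max are hypothetical stand-ins for whatever init_test_case and init_attrs set:

import numpy as np

in_shape, bins, hist_min, hist_max = (10, 12), 5, 1, 5  # hypothetical values
np_input = np.random.uniform(low=0.0, high=20.0, size=in_shape)
# np.histogram treats the final bin edge as inclusive, matching the op.
out, _ = np.histogram(np_input, bins=bins, range=(hist_min, hist_max))
out = out.astype(np.int64)  # the op outputs int64 counts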

@@ -862,41 +862,23 @@ def histogram(input, bins=100, min=0, max=0):
     If min and max are both zero, the minimum and maximum values of the data are used.
 
     Args:
-        input (Variable): A Tensor(or LoDTensor) with shape :math:`[N_1, N_2,..., N_k]` . The data type of the input Tensor
+        input (Tensor): A Tensor(or LoDTensor) with shape :math:`[N_1, N_2,..., N_k]` . The data type of the input Tensor
             should be float32, float64, int32, int64.
         bins (int): number of histogram bins
         min (int): lower end of the range (inclusive)
         max (int): upper end of the range (inclusive)
 
     Returns:
-        Variable: Tensor or LoDTensor calculated by histogram layer. The data type is int64.
+        Tensor: data type is int64, shape is (nbins,).
 
-    Code Example 1:
-        .. code-block:: python
-            import paddle
-            import numpy as np
-            startup_program = paddle.static.Program()
-            train_program = paddle.static.Program()
-            with paddle.static.program_guard(train_program, startup_program):
-                inputs = paddle.data(name='input', dtype='int32', shape=[2,3])
-                output = paddle.histogram(inputs, bins=5, min=1, max=5)
-                place = paddle.CPUPlace()
-                exe = paddle.static.Executor(place)
-                exe.run(startup_program)
-                img = np.array([[2, 4, 2], [2, 5, 4]]).astype(np.int32)
-                res = exe.run(train_program,
-                              feed={'input': img},
-                              fetch_list=[output])
-                print(np.array(res[0])) # [0,3,0,2,1]
-
-    Code Example 2:
+    Examples:
         .. code-block:: python
             import paddle
 
-            paddle.disable_static(paddle.CPUPlace())
             inputs = paddle.to_tensor([1, 2, 1])
             result = paddle.histogram(inputs, bins=4, min=0, max=3)
             print(result) # [0, 2, 1, 0]
-            paddle.enable_static()
     """
     if in_dygraph_mode():
        return core.ops.histogram(input, "bins", bins, "min", min, "max", max)
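The remaining dygraph example can be cross-checked against NumPy, since both treat the upper edge of the last bin as inclusive; a sanity check written for this note, not part of the patch:

import numpy as np

# paddle.histogram(inputs, bins=4, min=0, max=3) on [1, 2, 1] -> [0, 2, 1, 0]
counts, _ = np.histogram(np.array([1, 2, 1]), bins=4, range=(0, 3))
print(counts)  # [0 2 1 0], matching the docstring's expected output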
