You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Paddle/paddle/math/TensorAssign.h

143 lines
4.3 KiB

/**
* TensorAssign.h
*
* Author: hedaoyuan (hedaoyuan@baidu.com)
* Created on: 2016-10-08
*
* Copyright (c) Baidu.com, Inc. All Rights Reserved
*
*/
#pragma once
#include <algorithm>
#include "paddle/utils/Logging.h"
namespace paddle {
template<typename LhsType, typename RhsType, class T>
class TensorAssignOp {
public:
explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
: lhs_(lhs), rhs_(rhs) {
#ifndef __CUDA_ARCH__
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
#endif
}
INLINE void apply(const int i, const int j) {
lhs_.applyRef(i, j) = rhs_.apply(i, j);
}
INLINE void apply(const int index) {
lhs_.applyRef(index) = rhs_.apply(index);
}
INLINE size_t getWidth() const { return lhs_.getWidth(); }
INLINE size_t getHeight() const { return rhs_.getHeight(); }
INLINE bool isContiguous() const {
return lhs_.isContiguous() && rhs_.isContiguous();
}
INLINE bool useGpu() const { return lhs_.useGpu(); }
private:
TensorApply<LhsType, T> lhs_;
TensorApply<const RhsType, T> rhs_;
};
template <typename Assign, typename... AssignOp>
void AssignCpuEvaluate(int height, int width, bool isContiguous,
Assign&& assign, AssignOp&& ... args) {
if (isContiguous) {
int size = height * width;
for (int index = 0; index < size; index++) {
assign.apply(index);
__attribute__((unused)) int dummy[] = { (((args)).apply(index), 0)... };
}
} else {
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
assign.apply(i, j);
__attribute__((unused)) int dummy[] = { (((args)).apply(i, j), 0)... };
}
}
}
}
#ifdef __NVCC__
template <typename Assign, typename... AssignOp>
__global__
void AssignGpuEvaluate1(const int border, Assign assign, AssignOp ... args) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < border) {
assign.apply(idx);
__attribute__((unused)) int dummy[] = { (((args)).apply(idx), 0)... };
}
}
template <typename Assign, typename... AssignOp>
__global__
void AssignGpuEvaluate2(const int height, const int width,
Assign assign, AssignOp ... args) {
const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) {
for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) {
assign.apply(i, j);
__attribute__((unused)) int dummy[] = { (((args)).apply(i, j), 0)... };
}
}
}
#endif
// At least one assignment expression is required
template <typename Assign, typename... AssignOp>
void AssignEvaluate(Assign&& assign, AssignOp&& ... args) {
const bool useGpu_ = assign.useGpu();
bool isContiguous_ = assign.isContiguous();
const size_t height = assign.getHeight();
const size_t width = assign.getWidth();
const int packSize = sizeof...(args);
const bool packUseGpu[] = { ((args)).useGpu()... };
const bool packIsContiguous[] = { ((args)).isContiguous()... };
const size_t packHeight[] = { ((args)).getHeight()... };
const size_t packWidth[] = { ((args)).getWidth()... };
for (int i = 0; i < packSize; i++) {
CHECK_EQ(useGpu_, packUseGpu[i]);
CHECK_EQ(height, packHeight[i]);
CHECK_EQ(width, packWidth[i]);
isContiguous_ = isContiguous_ && packIsContiguous[i];
}
if (useGpu_) {
#ifdef __NVCC__
if (isContiguous_) {
int size = height * width;
int blockSize = size <= 1024 ? size : 1024;
int gridSize = (size + 1024 - 1) / 1024;
AssignGpuEvaluate1
<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(size, assign, args...);
} else {
int blockSizeY = std::min(32, (int)height);
int blockSizeX = (32 / blockSizeY) * 32;
int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX);
int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY);
dim3 threads(blockSizeX, blockSizeY);
dim3 grid(gridSizeX, gridSizeY);
AssignGpuEvaluate2
<<<grid, threads, 0, STREAM_DEFAULT>>>(height, width, assign, args...);
}
CHECK_SYNC("AssignEvaluate failed");
#endif
} else {
AssignCpuEvaluate(height, width, isContiguous_, assign, args...);
}
}
} // namespace paddle