/**
 * TensorAssign.h
 *
 * Author: hedaoyuan (hedaoyuan@baidu.com)
 * Created on: 2016-10-08
 *
 * Copyright (c) Baidu.com, Inc. All Rights Reserved
 */

#pragma once

#include <algorithm>
#include "paddle/utils/Logging.h"

namespace paddle {
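
/**
 * \brief Tensor assignment expression: evaluates lhs = rhs element by element.
 *
 * lhs must be writable (its TensorApply wrapper provides applyRef); rhs can be
 * any readable tensor expression. apply(index) is the 1-D path used when both
 * operands are contiguous; apply(i, j) is the generic 2-D path.
 */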
template<typename LhsType, typename RhsType, class T>
class TensorAssignOp {
public:
  explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
      : lhs_(lhs), rhs_(rhs) {
#ifndef __CUDA_ARCH__
    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
#endif
  }

  INLINE void apply(const int i, const int j) {
    lhs_.applyRef(i, j) = rhs_.apply(i, j);
  }
  INLINE void apply(const int index) {
    lhs_.applyRef(index) = rhs_.apply(index);
  }

  INLINE size_t getWidth() const { return lhs_.getWidth(); }
  INLINE size_t getHeight() const { return rhs_.getHeight(); }
  INLINE bool isContiguous() const {
    return lhs_.isContiguous() && rhs_.isContiguous();
  }
  INLINE bool useGpu() const { return lhs_.useGpu(); }

private:
  TensorApply<LhsType, T> lhs_;
  TensorApply<const RhsType, T> rhs_;
};
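
// Usage sketch (illustrative only; the concrete operand and expression types,
// and the TensorApply specializations they rely on, are defined elsewhere):
//
//   TensorAssignOp<LhsTensor, RhsExpr, real> assign(lhs, rhsExpr);
//   AssignEvaluate(assign);             // evaluate lhs = rhsExpr
//   AssignEvaluate(assign1, assign2);   // fuse several assignments in one pass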

template <typename Assign, typename... AssignOp>
void AssignCpuEvaluate(int height, int width, bool isContiguous,
                       Assign&& assign, AssignOp&& ... args) {
  if (isContiguous) {
    int size = height * width;
    for (int index = 0; index < size; index++) {
      assign.apply(index);
      // Expand the parameter pack: apply every extra assignment at the same
      // index. The comma operator discards each result and the unused dummy
      // array exists only to host the expansion (pre-C++17 fold idiom).
      __attribute__((unused)) int dummy[] = { (((args)).apply(index), 0)... };
    }
  } else {
    for (int i = 0; i < height; i++) {
      for (int j = 0; j < width; j++) {
        assign.apply(i, j);
        // Same pack-expansion trick for the 2-D path.
        __attribute__((unused)) int dummy[] = { (((args)).apply(i, j), 0)... };
      }
    }
  }
}
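
// CUDA kernels mirroring the CPU paths: AssignGpuEvaluate1 handles the 1-D
// contiguous case, AssignGpuEvaluate2 walks a 2-D grid-stride loop over
// (height, width). Operands are taken by value so they can be copied to the
// device with the kernel launch.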

#ifdef __NVCC__
template <typename Assign, typename... AssignOp>
__global__
void AssignGpuEvaluate1(const int border, Assign assign, AssignOp ... args) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < border) {
    assign.apply(idx);
    __attribute__((unused)) int dummy[] = { (((args)).apply(idx), 0)... };
  }
}

template <typename Assign, typename... AssignOp>
__global__
void AssignGpuEvaluate2(const int height, const int width,
                        Assign assign, AssignOp ... args) {
  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
  for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) {
    for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) {
      assign.apply(i, j);
      __attribute__((unused)) int dummy[] = { (((args)).apply(i, j), 0)... };
    }
  }
}
#endif
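
// Dispatch entry point: verifies that every fused assignment agrees on device
// placement, height, and width, then evaluates all of them in a single pass,
// either on the GPU (1-D kernel when every operand is contiguous, 2-D kernel
// otherwise) or on the CPU.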
// At least one assignment expression is required
template <typename Assign, typename... AssignOp>
void AssignEvaluate(Assign&& assign, AssignOp&& ... args) {
  const bool useGpu_ = assign.useGpu();
  bool isContiguous_ = assign.isContiguous();
  const size_t height = assign.getHeight();
  const size_t width = assign.getWidth();

  const int packSize = sizeof...(args);
  const bool packUseGpu[] = { ((args)).useGpu()... };
  const bool packIsContiguous[] = { ((args)).isContiguous()... };
  const size_t packHeight[] = { ((args)).getHeight()... };
  const size_t packWidth[] = { ((args)).getWidth()... };

  for (int i = 0; i < packSize; i++) {
    CHECK_EQ(useGpu_, packUseGpu[i]);
    CHECK_EQ(height, packHeight[i]);
    CHECK_EQ(width, packWidth[i]);
    isContiguous_ = isContiguous_ && packIsContiguous[i];
  }

  if (useGpu_) {
#ifdef __NVCC__
    if (isContiguous_) {
      int size = height * width;
      int blockSize = size <= 1024 ? size : 1024;
      int gridSize = (size + 1024 - 1) / 1024;
      AssignGpuEvaluate1
          <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(size, assign, args...);
    } else {
      int blockSizeY = std::min(32, (int)height);
      int blockSizeX = (32 / blockSizeY) * 32;
      int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX);
      int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY);
      dim3 threads(blockSizeX, blockSizeY);
      dim3 grid(gridSizeX, gridSizeY);
      AssignGpuEvaluate2
          <<<grid, threads, 0, STREAM_DEFAULT>>>(height, width, assign, args...);
    }

    CHECK_SYNC("AssignEvaluate failed");
#endif
  } else {
    AssignCpuEvaluate(height, width, isContiguous_, assign, args...);
  }
}
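
// Note: as exercised above, each TensorApply wrapper is expected to provide
// getWidth(), getHeight(), isContiguous(), useGpu(), and apply(index) /
// apply(i, j); the left-hand-side wrapper additionally provides applyRef().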
} // namespace paddle