@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "hl_base.h"
+#include "hl_device_functions.cuh"
 #include "CosSimOp.h"

 namespace paddle {
@@ -79,7 +80,7 @@ void hlCossim(real* output,
   KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
       (output, input1, input2, width, input1_height, input2_height, scale);
-  CHECK_SYNC("hl_cossim failed");
+  CHECK_SYNC("hlCossim failed");
 }

 template <>
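For reference, per output row the forward path above computes the scaled cosine similarity of a row x of input1 with the matching row y of input2 (when input2 has height 1, its single row is shared by every row of input1):

    \[ \mathrm{out} = s \,\frac{x \cdot y}{\lVert x \rVert \,\lVert y \rVert}, \qquad s = \texttt{scale}. \]
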
@@ -91,7 +92,7 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix* out_mat,
   CHECK(in1_mat->useGpu_ == true && in2_mat->useGpu_ == true)
       << "Matrix type are not GPU";

-  size_t numSamples = out_mat->getHeight();
+  size_t num_samples = out_mat->getHeight();
   size_t dim = in1_mat->getWidth();
   real* out = out_mat->getData();
   const real* x = in1_mat->getData();
@@ -99,4 +100,141 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix* out_mat,
   hlCossim(out, x, y, dim, in1_mat->getHeight(), in2_mat->getHeight(), scale);
 }
+
+template<int block_size>
+__global__ void KeCosSimDerivative(const real* grad,
+                                   const real* output,
+                                   const real* prev_out_x,
+                                   const real* prev_out_y,
+                                   real* prev_grad_x,
+                                   real* prev_grad_y,
+                                   size_t width,
+                                   size_t input1_height,
+                                   size_t input2_height,
+                                   real scale) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+
+  __shared__ real xx[block_size];
+  __shared__ real yy[block_size];
+  __shared__ real xy[block_size];
+
+  xx[tid] = 0.0;
+  yy[tid] = 0.0;
+  xy[tid] = 0.0;
+  __syncthreads();
+
+  prev_out_x += ty * width;
+  prev_grad_x += ty * width;
+  if (input2_height > 1) {
+    prev_out_y += ty * width;
+    prev_grad_y += ty * width;
+  }
+  for (int index = tid; index < width; index += block_size) {
+    real x = prev_out_x[index];
+    real y = prev_out_y[index];
+    xx[tid] += x * x;
+    yy[tid] += y * y;
+    xy[tid] += x * y;
+  }
+  __syncthreads();
+
+  for (int s = block_size / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      xx[tid] += xx[tid + s];
+      yy[tid] += yy[tid + s];
+      xy[tid] += xy[tid + s];
+    }
+    __syncthreads();
+  }
+  if (xy[0] == 0) {
+    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
+    for (int index = tid; index < width; index += block_size) {
+      prev_grad_x[index] +=
+        scale * grad[ty] * prev_out_y[index] * reciprocal;
+      if (input2_height > 1) {
+        prev_grad_y[index] +=
+          scale * grad[ty] * prev_out_x[index] * reciprocal;
+      } else {
+        paddle::paddleAtomicAdd(prev_grad_y + index,
+          scale * grad[ty] * prev_out_x[index] * reciprocal);
+      }
+    }
+  } else {
+    real reciprocalXY = 1.0 / xy[0];
+    real reciprocalSquareSumX = 1.0 / xx[0];
+    real reciprocalSquareSumY = 1.0 / yy[0];
+    for (int index = tid; index < width; index += block_size) {
+      prev_grad_x[index] += output[ty] * grad[ty] *
+        (prev_out_y[index] * reciprocalXY -
+         prev_out_x[index] * reciprocalSquareSumX);
+      if (input2_height > 1) {
+        prev_grad_y[index] += output[ty] * grad[ty] *
+          (prev_out_x[index] * reciprocalXY -
+           prev_out_y[index] * reciprocalSquareSumY);
+      } else {
+        paddle::paddleAtomicAdd(prev_grad_y + index, output[ty] * grad[ty] *
+          (prev_out_x[index] * reciprocalXY -
+           prev_out_y[index] * reciprocalSquareSumY));
+      }
+    }
+  }
+}
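For reference, the two branches above implement the analytic gradient of the scaled cosine c = s (x . y) / (||x|| ||y||) held in output[ty]:

    \[ \frac{\partial c}{\partial x_j} = c\left(\frac{y_j}{x \cdot y} - \frac{x_j}{\lVert x \rVert^2}\right), \qquad
       \frac{\partial c}{\partial y_j} = c\left(\frac{x_j}{x \cdot y} - \frac{y_j}{\lVert y \rVert^2}\right), \]

which, multiplied by the upstream grad[ty], is the else branch (reciprocalXY = 1/(x . y), reciprocalSquareSumX = 1/||x||^2, reciprocalSquareSumY = 1/||y||^2). When x . y = 0 the quotient form degenerates (c = 0) and the gradient reduces to s * y_j / (||x|| ||y||) and s * x_j / (||x|| ||y||), which is the xy[0] == 0 branch. The atomic adds cover the broadcast case (input2_height == 1), where every block accumulates into the same single row of prev_grad_y.
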
+
+void hlCossimDerivative(const real* grad,
+                        const real* output,
+                        const real* prev_out_x,
+                        const real* prev_out_y,
+                        real* prev_grad_x,
+                        real* prev_grad_y,
+                        size_t width,
+                        size_t input1_height,
+                        size_t input2_height,
+                        real scale) {
+  CHECK_NOTNULL(grad);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(prev_out_x);
+  CHECK_NOTNULL(prev_out_y);
+  CHECK_NOTNULL(prev_grad_x);
+  CHECK_NOTNULL(prev_grad_y);
+  const int block_size = 256;
+  dim3 threads(block_size, 1);
+  dim3 grid(1, input1_height);
+  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
+    (grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y, width,
+     input1_height, input2_height, scale);
+  CHECK_SYNC("hlCossimDerivative failed");
+}
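The launch configuration pairs one 256-thread block with each row of input1 (grid(1, input1_height)), and the shared-memory tree reduction in KeCosSimDerivative assumes block_size is a power of two. A minimal, self-contained sketch of that reduction pattern, with illustrative names not taken from the patch:

    // Block-level tree reduction, the same pattern used for xx/yy/xy above.
    template <int block_size>
    __global__ void blockSum(const float* in, float* out, int n) {
      __shared__ float buf[block_size];
      int tid = threadIdx.x;
      float acc = 0.f;
      // Strided load: each thread accumulates every block_size-th element.
      for (int i = tid; i < n; i += block_size) acc += in[i];
      buf[tid] = acc;
      __syncthreads();
      // Tree reduction: halve the number of active threads each step.
      for (int s = block_size / 2; s > 0; s >>= 1) {
        if (tid < s) buf[tid] += buf[tid + s];
        __syncthreads();
      }
      if (tid == 0) *out = buf[0];  // thread 0 publishes the block total
    }
    // launch, e.g.: blockSum<256><<<1, 256>>>(d_in, d_out, n);
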
+
+template <>
+void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix* out_grad,
+                                     const GpuMatrix* out_val,
+                                     const GpuMatrix* in1_val,
+                                     const GpuMatrix* in2_val,
+                                     GpuMatrix* in1_grad,
+                                     GpuMatrix* in2_grad,
+                                     real scale) {
+  CHECK(out_grad && out_val && in1_val && in2_val && in1_grad && in2_grad);
+  CHECK(out_grad->useGpu_ && out_val->useGpu_ && in1_val->useGpu_
+        && in2_val->useGpu_ && in1_grad->useGpu_ && in2_grad->useGpu_)
+      << "Matrix types are not equally GPU";
+
+  size_t dim = in1_val->getWidth();
+  const real* grad = out_grad->getData();
+  const real* out = out_val->getData();
+  const real* prev_out_x = in1_val->getData();
+  const real* prev_out_y = in2_val->getData();
+  real* prev_grad_x = in1_grad->getData();
+  real* prev_grad_y = in2_grad->getData();
+  hlCossimDerivative(grad,
+                     out,
+                     prev_out_x,
+                     prev_out_y,
+                     prev_grad_x,
+                     prev_grad_y,
+                     dim,
+                     in1_val->getHeight(),
+                     in2_val->getHeight(),
+                     scale);
+}
+
 }  // namespace paddle
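
A hedged sketch of driving the new GPU backward path end to end, assuming a PaddlePaddle build (GpuMatrix, real, DEVICE_TYPE_GPU, and the CosSimBackward specialization all come from the surrounding codebase; the sizes and the helper name are illustrative):

    #include "CosSimOp.h"
    #include "paddle/math/Matrix.h"

    namespace paddle {

    void cosSimBackwardSketch(real scale) {
      const size_t rows = 32, dim = 128;
      GpuMatrix out_grad(rows, 1);   // d(loss)/d(out), one scalar per sample
      GpuMatrix out_val(rows, 1);    // forward cosine values
      GpuMatrix in1_val(rows, dim);  // first input
      GpuMatrix in2_val(rows, dim);  // second input (height 1 would broadcast)
      GpuMatrix in1_grad(rows, dim);
      GpuMatrix in2_grad(rows, dim);
      in1_grad.zeroMem();            // gradients are accumulated with +=
      in2_grad.zeroMem();
      // ... fill out_grad, out_val, in1_val, in2_val ...
      CosSimBackward<DEVICE_TYPE_GPU>(&out_grad, &out_val, &in1_val, &in2_val,
                                      &in1_grad, &in2_grad, scale);
    }

    }  // namespace paddle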