@@ -1265,69 +1265,6 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
                      outGrad.getStride());
}

void GpuMatrix::crossMapNormalFwd(Matrix& input,
                                  size_t imgSizeH,
                                  size_t imgSizeW,
                                  Matrix& denoms,
                                  size_t channels,
                                  size_t sizeX,
                                  float scale,
                                  float pow) {
  size_t num = input.getHeight();
  size_t height = imgSizeH;
  size_t width = imgSizeW;

  CHECK(height * width * channels == input.getWidth());
  CHECK(denoms.getHeight() == input.getHeight() &&
        denoms.getWidth() == input.getWidth() && input.getHeight() == height_ &&
        input.getWidth() == width_);

  hl_CMRNorm_forward(num,
                     input.getData(),
                     denoms.getData(),
                     data_,
                     channels,
                     height,
                     width,
                     sizeX,
                     scale,
                     -pow);
}
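
// A sketch of the forward computation shared by this GPU kernel and the
// CpuMatrix version below: cross-map response normalization over a window
// of sizeX channels around channel c,
//
//   denoms(c) = 1 + scale * sum_{j in window(c)} input(j)^2
//   output(c) = input(c) * denoms(c)^(-pow)
//
// which is why the exponent is negated before being passed down.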

void GpuMatrix::crossMapNormalBwd(Matrix& localGrad,
                                  Matrix& denoms,
                                  Matrix& preOutV,
                                  Matrix& localOutV,
                                  size_t channels,
                                  size_t imgSizeH,
                                  size_t imgSizeW,
                                  size_t sizeX,
                                  float scale,
                                  float pow) {
  size_t num = preOutV.getHeight();
  size_t height = imgSizeH;
  size_t width = imgSizeW;

  CHECK(width * height * channels == preOutV.getWidth());
  CHECK(denoms.getHeight() == preOutV.getHeight() &&
        denoms.getWidth() == preOutV.getWidth() &&
        preOutV.getHeight() == height_ && preOutV.getWidth() == width_);
  CHECK(denoms.getHeight() == localGrad.getHeight() &&
        denoms.getWidth() == localGrad.getWidth());

  hl_CMRNorm_backward(num,
                      preOutV.getData(),
                      denoms.getData(),
                      localOutV.getData(),
                      localGrad.getData(),
                      data_,
                      channels,
                      height,
                      width,
                      sizeX,
                      -pow,
                      2.0f * pow * scale);
}
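
// A sketch of the gradient both backward paths compute. Differentiating
// output(c) = input(c) * denoms(c)^(-pow) gives a direct term plus a
// window term:
//
//   inGrad(c) = denoms(c)^(-pow) * outGrad(c)
//             - 2 * scale * pow * input(c) *
//                 sum_{j in window(c)} outGrad(j) * output(j) / denoms(j)
//
// hence the -pow and 2.0f * pow * scale arguments above.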

void GpuMatrix::maxSequenceForward(Matrix& input,
                                   const IVector& sequence,
                                   IVector& index) {

@@ -2219,119 +2156,6 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
  }
}

void CpuMatrix::crossMapNormalFwd(Matrix& input,
                                  size_t imgSizeH,
                                  size_t imgSizeW,
                                  Matrix& denoms,
                                  size_t channels,
                                  size_t sizeX,
                                  float scale,
                                  float pow) {
  CHECK(isContiguous());
  CHECK(input.isContiguous());
  CHECK(denoms.isContiguous());
  CHECK_EQ(getHeight(), input.getHeight());
  CHECK_EQ(getWidth(), input.getWidth());
  CHECK_EQ(getHeight(), denoms.getHeight());
  CHECK_EQ(getWidth(), denoms.getWidth());

  size_t numSample = input.getHeight();
  size_t numCols = input.getWidth();
  size_t height = imgSizeH;
  size_t width = imgSizeW;
  CHECK(height * width * channels == numCols);

  // TODO(hedaoyuan): after the TensorExpress code is committed,
  // restructure this code to remove the temporary memory.
  CpuMatrix tmp(channels, height * width);
  CpuMatrix tmp2(tmp.getData(), 1, channels * height * width);
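  // tmp2 aliases tmp's buffer as a single 1 x (channels * height * width)
  // row, so one square2() call per sample fills all of tmp's per-channel
  // rows at once.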
  denoms.zero();
  const int start = -((int)sizeX - 1) / 2;
  const int end = (int)sizeX + start;
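  // The window spans offsets [start, end): sizeX consecutive channels,
  // centered on c when sizeX is odd. (The backward pass below derives
  // start as -((int)sizeX) / 2 instead, which differs only for even sizeX.)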
  for (size_t i = 0; i < numSample; i++) {
    input.subMatrix(i, 1)->square2(tmp2);
    CpuMatrix subDen(
        denoms.subMatrix(i, 1)->getData(), channels, height * width);
    for (int c = 0; c < (int)channels; c++) {
      for (int s = start; s < end; s++) {
        if (c + s >= 0 && c + s < (int)channels) {
          subDen.subMatrix(c, 1)->add(*tmp.subMatrix(c + s, 1));
        }
      }
    }
  }
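
  // denoms now holds raw window sums of squares. add(scale, 1) below turns
  // them into the usual LRN denominator, 1 + scale * sum(x^2); the matrix
  // itself then becomes denoms^(-pow) elementwise-multiplied by input.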
  denoms.add(scale, (real)1);
  this->pow2(denoms, -pow);
  this->dotMul(input);
}

void CpuMatrix::crossMapNormalBwd(Matrix& localGrad,
                                  Matrix& denoms,
                                  Matrix& preOutV,
                                  Matrix& localOutV,
                                  size_t channels,
                                  size_t imgSizeH,
                                  size_t imgSizeW,
                                  size_t sizeX,
                                  float scale,
                                  float pow) {
  CHECK(isContiguous());
  CHECK(localGrad.isContiguous());
  CHECK(denoms.isContiguous());
  CHECK(preOutV.isContiguous());
  CHECK(localOutV.isContiguous());
  CHECK_EQ(getHeight(), localGrad.getHeight());
  CHECK_EQ(getWidth(), localGrad.getWidth());
  CHECK_EQ(getHeight(), denoms.getHeight());
  CHECK_EQ(getWidth(), denoms.getWidth());
  CHECK_EQ(getHeight(), preOutV.getHeight());
  CHECK_EQ(getWidth(), preOutV.getWidth());
  CHECK_EQ(getHeight(), localOutV.getHeight());
  CHECK_EQ(getWidth(), localOutV.getWidth());

  size_t numSample = getHeight();
  size_t numCols = getWidth();
  size_t height = imgSizeH;
  size_t width = imgSizeW;
  CHECK(height * width * channels == numCols);

  // TODO(hedaoyuan): after the TensorExpress code is committed,
  // restructure this code to remove the temporary memory.
  CpuMatrix tmp(1, height * width);

  const int start = -((int)sizeX) / 2;
  const int end = (int)sizeX + start;
  const real ratio = -(real)2 * scale * pow;
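  // ratio is the constant factor of the window term of the gradient (see
  // the sketch after hl_CMRNorm_backward above); the GPU path passes its
  // magnitude as 2.0f * pow * scale with the sign folded into the kernel.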

  for (size_t i = 0; i < numSample; i++) {
    CpuMatrix inputDiff(
        this->subMatrix(i, 1)->getData(), channels, height * width);
    CpuMatrix outDiff(
        localGrad.subMatrix(i, 1)->getData(), channels, height * width);
    CpuMatrix input(
        preOutV.subMatrix(i, 1)->getData(), channels, height * width);
    CpuMatrix output(
        localOutV.subMatrix(i, 1)->getData(), channels, height * width);
    CpuMatrix subDen(
        denoms.subMatrix(i, 1)->getData(), channels, height * width);
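
    // For each output channel c: add the direct term
    // denoms(c)^(-pow) * outGrad(c), then accumulate the window term
    // ratio * input(c) * outGrad(j) * output(j) / denoms(j) over every
    // in-range neighbor j = c + s.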
    for (int c = 0; c < (int)channels; c++) {
      tmp.pow2(*subDen.subMatrix(c, 1), -pow);
      inputDiff.subMatrix(c, 1)
          ->addDotMul(tmp, *outDiff.subMatrix(c, 1), (real)1, (real)1);
      for (int s = start; s < end; s++) {
        if (c + s >= 0 && c + s < (int)channels) {
          tmp.dotMul(*outDiff.subMatrix(c + s, 1), *output.subMatrix(c + s, 1));
          tmp.mulScalar(ratio);
          tmp.dotDiv(tmp, *subDen.subMatrix(c + s, 1));
          tmp.dotMul(*input.subMatrix(c, 1));
          inputDiff.subMatrix(c, 1)->add(tmp);
        }
      }
    }
  }
}

/**
 * Input: one or more sequences. Each sequence contains some instances.
 * Output: output size is the number of input sequences (NOT input instances).