|
|
|
@ -941,59 +941,6 @@ void GpuMatrix::softreluDerivative(Matrix& output) {
|
|
|
|
|
void GpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
  // No GPU-specific work needed: the shared BaseMatrix implementation
  // handles the scaled tanh (p1 * tanh(p2 * x)) for both backends.
  BaseMatrix::scaledTanh(output, p1, p2);
}
|
|
|
|
|
void GpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) {
  // Cosine similarity on the GPU: this matrix (numSamples x 1) receives
  // scale * <x_i, y_i> / (|x_i| * |y_i|) for each row pair of output1/output2.
  // Both operands must already live on the GPU for the hl_cossim kernel.
  CHECK(output1.useGpu_ == true && output2.useGpu_ == true)
      << "Matrix type are not equal";

  const size_t numSamples = getHeight();
  const size_t dim = output1.getWidth();

  // Result is a column vector with one similarity per sample.
  CHECK_EQ(getWidth(), 1UL);
  CHECK_EQ(output1.getHeight(), numSamples);
  CHECK_EQ(output1.getWidth(), output2.getWidth());

  real* result = getData();
  real* lhs = output1.getData();
  real* rhs = output2.getData();
  // Heights are forwarded separately; presumably the kernel broadcasts a
  // single-row output2 across all samples (mirrors the CPU path) — the
  // kernel source is not visible here to confirm.
  hl_cossim(result, lhs, rhs, dim, output1.getHeight(), output2.getHeight(),
            scale);
}
|
|
|
|
|
void GpuMatrix::cosSimDerivative(Matrix& output,
                                 Matrix& prevOut1,
                                 Matrix& prevOut2,
                                 Matrix& prevGrad1,
                                 Matrix& prevGrad2,
                                 real scale) {
  // Backward pass of cosSim on the GPU. `this` holds the incoming gradient
  // (numSamples x 1), `output` the forward result, and the kernel
  // accumulates into prevGrad1/prevGrad2.
  CHECK(output.useGpu_ == true && prevOut1.useGpu_ == true &&
        prevOut2.useGpu_ == true && prevGrad1.useGpu_ == true &&
        prevGrad2.useGpu_ == true)
      << "Matrix type are not equal";

  // Gradient and forward output are both column vectors.
  CHECK_EQ(getWidth(), 1UL);
  CHECK_EQ(output.getWidth(), 1UL);

  const size_t numSamples = getHeight();
  CHECK_EQ(output.getHeight(), numSamples);
  CHECK_EQ(prevOut1.getHeight(), numSamples);
  CHECK_EQ(prevGrad1.getHeight(), numSamples);

  const size_t dim = prevOut1.getWidth();
  CHECK_EQ(prevOut2.getWidth(), dim);
  CHECK_EQ(prevGrad1.getWidth(), dim);
  CHECK_EQ(prevGrad2.getWidth(), dim);

  real* topGrad = getData();
  real* topOut = output.getData();
  real* inX = prevOut1.getData();
  real* inY = prevOut2.getData();
  real* gradX = prevGrad1.getData();
  real* gradY = prevGrad2.getData();

  hl_cossim_derivative(topGrad,
                       topOut,
                       inX,
                       inY,
                       gradX,
                       gradY,
                       dim,
                       prevOut1.getHeight(),
                       prevOut2.getHeight(),
                       scale);
}
|
|
|
|
|
|
|
|
|
|
void GpuMatrix::randomizeUniform() {
|
|
|
|
|
CHECK(isContiguous());
|
|
|
|
@ -3470,105 +3417,6 @@ void CpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void CpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) {
  // CPU cosine similarity: out[i] = scale * <x_i, y_i> / (|x_i| * |y_i|).
  // `this` is numSamples x 1; output1 is numSamples x dim; output2 is either
  // numSamples x dim or a single 1 x dim row broadcast to every sample.
  const size_t numSamples = getHeight();
  const size_t dim = output1.getWidth();
  CHECK_EQ(getWidth(), 1UL);
  CHECK_EQ(output1.getHeight(), numSamples);
  CHECK_EQ(output1.getWidth(), output2.getWidth());

  real* out = getData();
  const real* x = output1.getData();
  const real* y = output2.getData();

  // Stride 0 reuses the single output2 row for every sample (broadcast).
  const size_t yInc = (output2.getHeight() == 1LU) ? 0 : dim;
  if (yInc != 0) {
    CHECK_EQ(output2.getHeight(), numSamples);
  }

  for (size_t i = 0; i < numSamples; ++i, x += dim, y += yInc) {
    real normSqX = 0;
    real normSqY = 0;
    real dot = 0;
    for (size_t j = 0; j < dim; ++j) {
      normSqX += _square(x[j]);
      normSqY += _square(y[j]);
      dot += x[j] * y[j];
    }
    // Zero vectors have no defined cosine similarity.
    CHECK(normSqX > 0 && normSqY > 0);
    out[i] = scale * dot / (std::sqrt(normSqX) * std::sqrt(normSqY));
  }
}
|
|
|
|
|
|
|
|
|
|
// Backward pass of CpuMatrix::cosSim.
//
// `this` holds the incoming gradient dL/d(sim) (numSamples x 1), `output`
// the forward cosine-similarity result (numSamples x 1, already includes
// `scale`), prevOut1/prevOut2 the forward inputs, and prevGrad1/prevGrad2
// the gradient buffers to accumulate into (+=, not overwrite).
// prevOut2/prevGrad2 may be a single 1 x dim row broadcast across samples.
void CpuMatrix::cosSimDerivative(Matrix& output,
                                 Matrix& prevOut1,
                                 Matrix& prevOut2,
                                 Matrix& prevGrad1,
                                 Matrix& prevGrad2,
                                 real scale) {
  CHECK(output.useGpu_ == false) << "Matrix type are not equal";

  // Gradient and forward output are both column vectors.
  CHECK_EQ(getWidth(), 1UL);
  CHECK_EQ(output.getWidth(), 1UL);

  size_t numSamples = getHeight();
  CHECK_EQ(output.getHeight(), numSamples);
  CHECK_EQ(prevOut1.getHeight(), numSamples);
  CHECK_EQ(prevGrad1.getHeight(), numSamples);

  size_t dim = prevOut1.getWidth();
  CHECK_EQ(prevOut2.getWidth(), dim);
  CHECK_EQ(prevGrad1.getWidth(), dim);
  CHECK_EQ(prevGrad2.getWidth(), dim);

  const real* grad = getData();
  const real* out = output.getData();
  const real* prevOutX = prevOut1.getData();
  const real* prevOutY = prevOut2.getData();
  real* prevGradX = prevGrad1.getData();
  real* prevGradY = prevGrad2.getData();
  // yInc == 0 pins prevOutY/prevGradY to their single row, so in the
  // broadcast case every sample accumulates into the same prevGradY row.
  size_t yInc = dim;
  if (prevOut2.getHeight() == 1LU) {
    yInc = 0;
    CHECK_EQ(prevGrad2.getHeight(), 1LU);
  } else {
    CHECK_EQ(prevOut2.getHeight(), numSamples);
    CHECK_EQ(prevGrad2.getHeight(), numSamples);
  }
  for (size_t i = 0; i < numSamples; ++i,
                                    prevOutX += dim,
                                    prevOutY += yInc,
                                    prevGradX += dim,
                                    prevGradY += yInc) {
    // Recompute the norms and dot product of the forward inputs; only the
    // scaled similarity was cached in `output`, not these intermediates.
    real squareSumX = 0;
    real squareSumY = 0;
    real xy = 0;
    for (size_t j = 0; j < dim; ++j) {
      squareSumX += _square(prevOutX[j]);
      squareSumY += _square(prevOutY[j]);
      xy += prevOutX[j] * prevOutY[j];
    }
    // Zero input vectors would make the derivative undefined.
    CHECK(squareSumX > 0 && squareSumY > 0);
    if (xy == 0) {
      // Orthogonal inputs: sim == 0, so the x/|x|^2 term of the general
      // formula vanishes and d(sim)/dx reduces to scale * y / (|x||y|).
      real reciprocal = 1.0f / (std::sqrt(squareSumX) * std::sqrt(squareSumY));
      for (size_t j = 0; j < dim; ++j) {
        prevGradX[j] += scale * grad[i] * prevOutY[j] * reciprocal;
        prevGradY[j] += scale * grad[i] * prevOutX[j] * reciprocal;
      }
    } else {
      // General case, using sim = scale * xy / (|x||y|) (== out[i]):
      //   d(sim)/dx_j = sim * (y_j / xy - x_j / |x|^2)
      // and symmetrically for y; out[i] already carries `scale`.
      real reciprocalXY = 1.0f / xy;
      real reciprocalSquareSumX = 1.0f / squareSumX;
      real reciprocalSquareSumY = 1.0f / squareSumY;
      for (size_t j = 0; j < dim; ++j) {
        prevGradX[j] += out[i] * grad[i] * (prevOutY[j] * reciprocalXY -
                                            prevOutX[j] * reciprocalSquareSumX);
        prevGradY[j] += out[i] * grad[i] * (prevOutX[j] * reciprocalXY -
                                            prevOutY[j] * reciprocalSquareSumY);
      }
    }
  }
}
|
|
|
|
|
|
|
|
|
|
void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
|
|
|
|
|
CHECK(output.useGpu_ == false && label.useGpu_ == false)
|
|
|
|
|
<< "Matrix type are not equal";
|
|
|
|
|