From 784740d8bee8e9127270edd1288289e9d9c864b8 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 11 Dec 2017 18:29:05 +0800 Subject: [PATCH 1/8] refine cos-sim-op --- paddle/operators/cos_sim_op.h | 189 +++++++++++++-------- paddle/operators/elementwise_op_function.h | 55 ++++++ 2 files changed, 170 insertions(+), 74 deletions(-) diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index fecb5a79b2..3a7e67506d 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/elementwise_add_op.h" namespace paddle { namespace operators { @@ -27,6 +28,28 @@ template using EigenVector = framework::EigenVector; +template +void Function_forward(T* out, T* x_norm, T* y_norm, + ElementIterator& x, + ElementIterator& y, int row, int col) { + for (int i = 0; i < row; ++i) { + T xx = 0; + T yy = 0; + T xy = 0; + for (int j = 0; j < col; ++j) { + xy += (*x) * (*y); + xx += (*x) * (*x); + yy += (*y) * (*y); + ++y; + ++x; + } + x_norm[i] = sqrt(xx); + y_norm[i] = sqrt(yy); + + out[i] = xy / (x_norm[i] * y_norm[i]); + } +} + template class CosSimKernel : public framework::OpKernel { public: @@ -41,32 +64,63 @@ class CosSimKernel : public framework::OpKernel { out_x_norm->mutable_data(context.GetPlace()); out_y_norm->mutable_data(context.GetPlace()); - // convert Tensor to Eigen Tensor int rows_x = in_x->dims()[0]; int rows_y = in_y->dims()[0]; - auto x = EigenMatrix::Reshape(*in_x, 1); - auto y = EigenMatrix::Reshape(*in_y, 1); - auto z = EigenVector::Flatten(*out_z); - auto x_norm = EigenVector::Flatten(*out_x_norm); - auto y_norm = EigenVector::Flatten(*out_y_norm); - - // compute - auto& place = - *context.template device_context().eigen_device(); - auto row_along = Eigen::array({{1}}); - x_norm.device(place) = x.square().sum(row_along).sqrt(); - y_norm.device(place) = y.square().sum(row_along).sqrt(); - 
if (rows_x == rows_y) { - auto xy = (x * y).sum(Eigen::array({{1}})); - z.device(place) = xy / x_norm / y_norm; - } else { - Eigen::DSizes bcast(rows_x, 1); - auto xy = (x * y.broadcast(bcast)).sum(row_along); - z.device(place) = xy / x_norm / y_norm.broadcast(bcast); - } + + int cols = framework::product(in_x->dims()) / rows_x; + auto x_iter = ElementIterator(in_x->data(), rows_x, + cols, rows_x, cols); + auto y_iter = ElementIterator(in_y->data(), rows_y, + cols, rows_x, cols); + + Function_forward(out_z->data(), out_x_norm->data(), + out_y_norm->data(), x_iter, y_iter, rows_x, cols); + // + // // convert Tensor to Eigen Tensor + //// int rows_x = in_x->dims()[0]; + //// int rows_y = in_y->dims()[0]; + // auto x = EigenMatrix::Reshape(*in_x, 1); + // auto y = EigenMatrix::Reshape(*in_y, 1); + // auto z = EigenVector::Flatten(*out_z); + // auto x_norm = EigenVector::Flatten(*out_x_norm); + // auto y_norm = EigenVector::Flatten(*out_y_norm); + // + // // compute + // auto& place = + // *context.template device_context().eigen_device(); + // auto row_along = Eigen::array({{1}}); + // x_norm.device(place) = x.square().sum(row_along).sqrt(); + // y_norm.device(place) = y.square().sum(row_along).sqrt(); + // if (rows_x == rows_y) { + // auto xy = (x * y).sum(Eigen::array({{1}})); + // z.device(place) = xy / x_norm / y_norm; + // } else { + // Eigen::DSizes bcast(rows_x, 1); + // auto xy = (x * y.broadcast(bcast)).sum(row_along); + // z.device(place) = xy / x_norm / y_norm.broadcast(bcast); + // } } }; +template +void Function_element(T* result, ElementIterator dz, + ElementIterator y, + ElementIterator x_norm, + ElementIterator y_norm, + ElementIterator z, + ElementIterator x, int num, int block) { + for (int i = 0; i < num; ++i) { + result[i % block] += (*dz) * ((*y) / ((*x_norm) * (*y_norm)) - + (*z) * (*x) / ((*x_norm) * (*x_norm))); + ++dz; + ++y; + ++x_norm; + ++y_norm; + ++z; + ++x; + } +} + template class CosSimGradKernel : public framework::OpKernel { public: 
@@ -81,63 +135,50 @@ class CosSimGradKernel : public framework::OpKernel { auto* out_grad_y = context.Output(framework::GradVarName("Y")); auto* in_grad_z = context.Input(framework::GradVarName("Out")); - // convert Tensor to Eigen Tensor - auto x = EigenMatrix::Reshape(*in_x, 1); - auto y = EigenMatrix::Reshape(*in_y, 1); - auto z = EigenMatrix::Reshape(*in_z, 1); - auto x_norm = EigenMatrix::Reshape(*in_x_norm, 1); - auto y_norm = EigenMatrix::Reshape(*in_y_norm, 1); - auto dz = EigenMatrix::Reshape(*in_grad_z, 1); - // compute gradident int rows_x = in_x->dims()[0]; int rows_y = in_y->dims()[0]; int cols = framework::product(in_x->dims()) / rows_x; - Eigen::DSizes bcast_cols(1, cols); - auto z_bcast = z.broadcast(bcast_cols); - auto dz_bcast = dz.broadcast(bcast_cols); - auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast_cols); - auto& place = - *context.template device_context().eigen_device(); - if (rows_x == rows_y) { - auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_cols); - auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast_cols); - // compute dx - if (out_grad_x) { - out_grad_x->mutable_data(context.GetPlace()); - auto dx = EigenMatrix::Reshape(*out_grad_x, 1); - auto grad = y / norm_prod_bcast - z_bcast * x / x_snorm_bcast; - dx.device(place) = dz_bcast * grad; - } - // compute dy - if (out_grad_y) { - out_grad_y->mutable_data(context.GetPlace()); - auto dy = EigenMatrix::Reshape(*out_grad_y, 1); - auto grad = x / norm_prod_bcast - z_bcast * y / y_snorm_bcast; - dy.device(place) = dz_bcast * grad; - } - } else { - Eigen::DSizes bcast_rows(rows_x, 1); - Eigen::DSizes bcast_rows_cols(rows_x, cols); - auto y_bcast = y.broadcast(bcast_rows); - auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_rows_cols); - auto norm_prod_bcast = (x_norm * y_norm.eval().broadcast(bcast_rows)) - .eval() - .broadcast(bcast_cols); - // compute dx - if (out_grad_x) { - out_grad_x->mutable_data(context.GetPlace()); - auto dx = 
EigenMatrix::Reshape(*out_grad_x, 1); - auto grad = y_bcast / norm_prod_bcast - z_bcast * x / x_snorm_bcast; - dx.device(place) = dz_bcast * grad; - } - // compute dy - if (out_grad_y) { - out_grad_y->mutable_data(context.GetPlace()); - auto dy = EigenVector::Flatten(*out_grad_y); - auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast; - dy.device(place) = (dz_bcast * grad).sum(Eigen::array({{0}})); - } + + ////////////////////////////// + // ## + auto x_iter = ElementIterator(in_x->data(), rows_x, + cols, rows_x, cols); + auto y_iter = ElementIterator(in_y->data(), rows_y, + cols, rows_x, cols); + auto z_iter = ElementIterator(in_z->data(), rows_x, 1, + rows_x, cols); + auto dz_iter = ElementIterator(in_grad_z->data(), + rows_x, 1, rows_x, cols); + auto x_norm_iter = ElementIterator( + in_x_norm->data(), rows_x, 1, rows_x, cols); + auto y_norm_iter = ElementIterator( + in_y_norm->data(), rows_y, 1, rows_x, cols); + // ## + ////////////////////////////// + // compute dx + if (out_grad_x) { + out_grad_x->mutable_data(context.GetPlace()); + + ////////////////////////////// + // ## + Function_element(out_grad_x->data(), dz_iter, y_iter, x_norm_iter, + y_norm_iter, z_iter, x_iter, rows_x * cols, + rows_x * cols); + // ## + ////////////////////////////// + } + // compute dy + if (out_grad_y) { + out_grad_y->mutable_data(context.GetPlace()); + + ////////////////////////////// + // ## + Function_element(out_grad_y->data(), dz_iter, x_iter, y_norm_iter, + x_norm_iter, z_iter, y_iter, rows_x * cols, + rows_y * cols); + // ## + ////////////////////////////// } } }; diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 7ebfc7df8c..33b7d06467 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -131,6 +131,61 @@ class MidWiseTransformIterator { int post_; }; +template +class ElementIterator; + +// Fixed(zcd) : Only support 2D +template +class 
ElementIterator { + public: + ElementIterator(const T* ptr, int t_m, int t_n, int m, int n) + : ptr_(ptr), + index_(0), + i_(0), + j_(0), + t_m_(t_m), + t_n_(t_n), + m_(m), + n_(n) {} + + ElementIterator& operator++() { + ++j_; + + if ((j_ == n_)) { + j_ = 0; + ++i_; + } + int t_i = (t_m_ == 1) ? 0 : i_; + int t_j = (t_n_ == 1) ? 0 : j_; + index_ = t_i * t_n_ + t_j; + + return *this; + } + + bool operator==( + const ElementIterator& rhs) const { + return (ptr_ + index_) == &(*rhs); + } + + bool operator!=( + const ElementIterator& rhs) const { + return (ptr_ + index_) != &(*rhs); + } + + const T& operator*() { return ptr_[index_]; } + + private: + // t_m_ == m_ || t_n_ == n_ || (t_m_ == 1 && t_m_ == 1) + const T* ptr_; + int index_; + int i_; + int j_; + int64_t t_m_; + int64_t t_n_; + int64_t m_; + int64_t n_; +}; + #ifdef __NVCC__ template class RowwiseTransformIterator From bcf0b56f6a0d649e1f7e71df485d2d0f2a278a77 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Sat, 23 Dec 2017 17:59:43 +0800 Subject: [PATCH 2/8] refine iterator --- paddle/operators/cos_sim_op.h | 335 ++++++++++++++------- paddle/operators/elementwise_op_function.h | 55 ---- 2 files changed, 229 insertions(+), 161 deletions(-) diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 3a7e67506d..e96592ab28 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" -#include "paddle/operators/elementwise_add_op.h" +#include "paddle/operators/elementwise_op_function.h" namespace paddle { namespace operators { @@ -28,27 +28,73 @@ template using EigenVector = framework::EigenVector; -template -void Function_forward(T* out, T* x_norm, T* y_norm, - ElementIterator& x, - ElementIterator& y, int row, int col) { - for (int i = 0; i < row; ++i) { - T xx = 0; +template +static void ForEachZip(IT1 begin1, IT1 last1, IT2 begin2, Callback callback) { + // This 
method could be implemented in CUDA + for (; begin1 < last1; ++begin1, ++begin2) { + callback(*begin1, *begin2); + } +} + +template +struct CosSimFunctor { + CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + cols_(static_cast(cols)) {} + + inline void operator()(T& x_norm, T& y_norm) const { + size_t x_offset = &x_norm - x_norm_; + size_t y_offset = &y_norm - y_norm_; + + auto* x = x_ + cols_ * x_offset; + + T xx = 0, xy = 0; T yy = 0; - T xy = 0; - for (int j = 0; j < col; ++j) { - xy += (*x) * (*y); - xx += (*x) * (*x); - yy += (*y) * (*y); - ++y; - ++x; + if (same_row) { + auto* y = y_ + cols_ * y_offset; + for (size_t i = 0; i < cols_; ++i) { + xx += x[i] * x[i]; + yy += y[i] * y[i]; + xy += x[i] * y[i]; + } + xx = sqrt(xx); + yy = sqrt(yy); + x_norm_[x_offset] = xx; + y_norm_[y_offset] = yy; + z_[x_offset] = xy / (xx * yy); + } else { + auto* y = y_; + // if (yy == -1) { + // yy = 0; + // for (size_t i = 0; i < cols_; ++i) { + // yy += y[i] * y[i]; + // } + // y_norm[0] = sqrt(yy); + // } + for (size_t i = 0; i < cols_; ++i) { + xx += x[i] * x[i]; + yy += y[i] * y[i]; // only need + xy += x[i] * y[i]; + } + xx = sqrt(xx); + yy = sqrt(yy); + x_norm_[x_offset] = xx; + y_norm_[0] = yy; + z_[x_offset] = xy / (xx * yy); } - x_norm[i] = sqrt(xx); - y_norm[i] = sqrt(yy); - - out[i] = xy / (x_norm[i] * y_norm[i]); } -} + + T* x_norm_; + T* y_norm_; + const T* x_; + const T* y_; + T* z_; + const size_t cols_; +}; template class CosSimKernel : public framework::OpKernel { @@ -68,58 +114,140 @@ class CosSimKernel : public framework::OpKernel { int rows_y = in_y->dims()[0]; int cols = framework::product(in_x->dims()) / rows_x; - auto x_iter = ElementIterator(in_x->data(), rows_x, - cols, rows_x, cols); - auto y_iter = ElementIterator(in_y->data(), rows_y, - cols, rows_x, cols); - - Function_forward(out_z->data(), out_x_norm->data(), - out_y_norm->data(), x_iter, y_iter, rows_x, 
cols); - // - // // convert Tensor to Eigen Tensor - //// int rows_x = in_x->dims()[0]; - //// int rows_y = in_y->dims()[0]; - // auto x = EigenMatrix::Reshape(*in_x, 1); - // auto y = EigenMatrix::Reshape(*in_y, 1); - // auto z = EigenVector::Flatten(*out_z); - // auto x_norm = EigenVector::Flatten(*out_x_norm); - // auto y_norm = EigenVector::Flatten(*out_y_norm); - // - // // compute - // auto& place = - // *context.template device_context().eigen_device(); - // auto row_along = Eigen::array({{1}}); - // x_norm.device(place) = x.square().sum(row_along).sqrt(); - // y_norm.device(place) = y.square().sum(row_along).sqrt(); - // if (rows_x == rows_y) { - // auto xy = (x * y).sum(Eigen::array({{1}})); - // z.device(place) = xy / x_norm / y_norm; - // } else { - // Eigen::DSizes bcast(rows_x, 1); - // auto xy = (x * y.broadcast(bcast)).sum(row_along); - // z.device(place) = xy / x_norm / y_norm.broadcast(bcast); - // } + + if (rows_x == rows_y) { + CosSimFunctor functor( + in_x->data(), in_y->data(), out_x_norm->data(), + out_y_norm->data(), out_z->data(), cols); + ForEachZip(out_x_norm->data(), out_x_norm->data() + rows_x, + out_y_norm->data(), functor); + } else { + CosSimFunctor functor( + in_x->data(), in_y->data(), out_x_norm->data(), + out_y_norm->data(), out_z->data(), cols); + ForEachZip(out_x_norm->data(), out_x_norm->data() + rows_x, + out_y_norm->data(), functor); + } } }; -template -void Function_element(T* result, ElementIterator dz, - ElementIterator y, - ElementIterator x_norm, - ElementIterator y_norm, - ElementIterator z, - ElementIterator x, int num, int block) { - for (int i = 0; i < num; ++i) { - result[i % block] += (*dz) * ((*y) / ((*x_norm) * (*y_norm)) - - (*z) * (*x) / ((*x_norm) * (*x_norm))); - ++dz; - ++y; - ++x_norm; - ++y_norm; - ++z; - ++x; +template +struct CosSimGradFunctor { + CosSimGradFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), 
+ x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + void operator()(const T& x_norm, const T& y_norm) const { + size_t x_offset = &x_norm - x_norm_; + size_t y_offset = &y_norm - y_norm_; + + auto x_norm_square = x_norm_[x_offset] * x_norm_[x_offset]; + // auto y_norm_square = y_norm_[y_offset] * y_norm_[y_offset]; + auto xy_norm_prod = x_norm_[x_offset] * y_norm_[y_offset]; + auto dz = dz_[x_offset]; + + auto* dx = dx_ + cols_ * x_offset; + auto* x = x_ + cols_ * x_offset; + auto* y = y_ + cols_ * y_offset; + auto z = z_[x_offset]; + + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y[i] / xy_norm_prod - z * x[i] / x_norm_square); + } } -} + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDxFunctor { + CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + void operator()(const T& x_norm, const T& y_norm) const { + size_t x_offset = &x_norm - x_norm_; + + auto x_norm_square = x_norm_[x_offset] * x_norm_[x_offset]; + auto xy_norm_prod = x_norm_[x_offset] * y_norm_[0]; + auto dz = dz_[x_offset]; + auto z = z_[x_offset]; + + auto* dx = dx_ + cols_ * x_offset; + auto* x = x_ + cols_ * x_offset; + + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y_[i] / xy_norm_prod - z * x[i] / x_norm_square); + } + } + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDyFunctor { + CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dy, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dy_(dy), + cols_(static_cast(cols)) {} + + void 
operator()(const T& x_norm, const T& y_norm) const { + size_t x_offset = &x_norm - x_norm_; + + auto y_norm_square = y_norm_[0] * y_norm_[0]; + auto xy_norm_prod = x_norm_[x_offset] * y_norm_[0]; + auto dz = dz_[x_offset]; + auto z = z_[x_offset]; + auto* x = x_ + cols_ * x_offset; + + for (size_t i = 0; i < cols_; ++i) { + dy_[i] += dz * (x[i] / xy_norm_prod - z * y_[i] / y_norm_square); + } + } + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dy_; + const size_t cols_; +}; template class CosSimGradKernel : public framework::OpKernel { @@ -140,45 +268,40 @@ class CosSimGradKernel : public framework::OpKernel { int rows_y = in_y->dims()[0]; int cols = framework::product(in_x->dims()) / rows_x; - ////////////////////////////// - // ## - auto x_iter = ElementIterator(in_x->data(), rows_x, - cols, rows_x, cols); - auto y_iter = ElementIterator(in_y->data(), rows_y, - cols, rows_x, cols); - auto z_iter = ElementIterator(in_z->data(), rows_x, 1, - rows_x, cols); - auto dz_iter = ElementIterator(in_grad_z->data(), - rows_x, 1, rows_x, cols); - auto x_norm_iter = ElementIterator( - in_x_norm->data(), rows_x, 1, rows_x, cols); - auto y_norm_iter = ElementIterator( - in_y_norm->data(), rows_y, 1, rows_x, cols); - // ## - ////////////////////////////// - // compute dx - if (out_grad_x) { - out_grad_x->mutable_data(context.GetPlace()); - - ////////////////////////////// - // ## - Function_element(out_grad_x->data(), dz_iter, y_iter, x_norm_iter, - y_norm_iter, z_iter, x_iter, rows_x * cols, - rows_x * cols); - // ## - ////////////////////////////// - } - // compute dy - if (out_grad_y) { - out_grad_y->mutable_data(context.GetPlace()); - - ////////////////////////////// - // ## - Function_element(out_grad_y->data(), dz_iter, x_iter, y_norm_iter, - x_norm_iter, z_iter, y_iter, rows_x * cols, - rows_y * cols); - // ## - ////////////////////////////// + if (rows_x == rows_y) { + if (out_grad_x) { + CosSimGradFunctor 
functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), in_grad_z->data(), + out_grad_x->mutable_data(context.GetPlace()), cols); + ForEachZip(in_x_norm->data(), in_x_norm->data() + rows_x, + in_y_norm->data(), functor); + } + if (out_grad_y) { + CosSimGradFunctor functor( + in_y_norm->data(), in_x_norm->data(), in_y->data(), + in_x->data(), in_z->data(), in_grad_z->data(), + out_grad_y->mutable_data(context.GetPlace()), cols); + ForEachZip(in_y_norm->data(), in_y_norm->data() + rows_x, + in_x_norm->data(), functor); + } + } else { + if (out_grad_x) { + CosSimDxFunctor functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), in_grad_z->data(), + out_grad_x->mutable_data(context.GetPlace()), cols); + ForEachZip(in_x_norm->data(), in_x_norm->data() + rows_x, + in_y_norm->data(), functor); + } + if (out_grad_y) { + CosSimDyFunctor functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), in_grad_z->data(), + out_grad_y->mutable_data(context.GetPlace()), cols); + ForEachZip(in_x_norm->data(), in_x_norm->data() + rows_x, + in_y_norm->data(), functor); + } } } }; diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 33b7d06467..7ebfc7df8c 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -131,61 +131,6 @@ class MidWiseTransformIterator { int post_; }; -template -class ElementIterator; - -// Fixed(zcd) : Only support 2D -template -class ElementIterator { - public: - ElementIterator(const T* ptr, int t_m, int t_n, int m, int n) - : ptr_(ptr), - index_(0), - i_(0), - j_(0), - t_m_(t_m), - t_n_(t_n), - m_(m), - n_(n) {} - - ElementIterator& operator++() { - ++j_; - - if ((j_ == n_)) { - j_ = 0; - ++i_; - } - int t_i = (t_m_ == 1) ? 0 : i_; - int t_j = (t_n_ == 1) ? 
0 : j_; - index_ = t_i * t_n_ + t_j; - - return *this; - } - - bool operator==( - const ElementIterator& rhs) const { - return (ptr_ + index_) == &(*rhs); - } - - bool operator!=( - const ElementIterator& rhs) const { - return (ptr_ + index_) != &(*rhs); - } - - const T& operator*() { return ptr_[index_]; } - - private: - // t_m_ == m_ || t_n_ == n_ || (t_m_ == 1 && t_m_ == 1) - const T* ptr_; - int index_; - int i_; - int j_; - int64_t t_m_; - int64_t t_n_; - int64_t m_; - int64_t n_; -}; - #ifdef __NVCC__ template class RowwiseTransformIterator From 49df2a784be8dabda85f82620ff4601ce113d332 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 25 Dec 2017 20:17:27 +0800 Subject: [PATCH 3/8] refine gradient function --- paddle/operators/cos_sim_op.h | 101 +++++++++++----------------------- 1 file changed, 33 insertions(+), 68 deletions(-) diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index e96592ab28..cd5c703c30 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -13,7 +13,6 @@ limitations under the License. */ #pragma once -#include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" #include "paddle/operators/elementwise_op_function.h" @@ -21,16 +20,9 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -using EigenMatrix = framework::EigenMatrix; -template -using EigenVector = framework::EigenVector; template static void ForEachZip(IT1 begin1, IT1 last1, IT2 begin2, Callback callback) { - // This method could be implemented in CUDA for (; begin1 < last1; ++begin1, ++begin2) { callback(*begin1, *begin2); } @@ -66,15 +58,8 @@ struct CosSimFunctor { x_norm_[x_offset] = xx; y_norm_[y_offset] = yy; z_[x_offset] = xy / (xx * yy); - } else { + } else { // This can be wrote in a better way. 
auto* y = y_; - // if (yy == -1) { - // yy = 0; - // for (size_t i = 0; i < cols_; ++i) { - // yy += y[i] * y[i]; - // } - // y_norm[0] = sqrt(yy); - // } for (size_t i = 0; i < cols_; ++i) { xx += x[i] * x[i]; yy += y[i] * y[i]; // only need @@ -144,22 +129,25 @@ struct CosSimGradFunctor { dx_(dx), cols_(static_cast(cols)) {} - void operator()(const T& x_norm, const T& y_norm) const { + inline void operator()(const T& x_norm, const T& y_norm) const { size_t x_offset = &x_norm - x_norm_; size_t y_offset = &y_norm - y_norm_; auto x_norm_square = x_norm_[x_offset] * x_norm_[x_offset]; - // auto y_norm_square = y_norm_[y_offset] * y_norm_[y_offset]; auto xy_norm_prod = x_norm_[x_offset] * y_norm_[y_offset]; auto dz = dz_[x_offset]; + auto z = z_[x_offset]; auto* dx = dx_ + cols_ * x_offset; auto* x = x_ + cols_ * x_offset; + auto* y = y_ + cols_ * y_offset; - auto z = z_[x_offset]; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto reciprocal_x_norm_square = 1 / x_norm_square; for (size_t i = 0; i < cols_; ++i) { - dx[i] = dz * (y[i] / xy_norm_prod - z * x[i] / x_norm_square); + dx[i] = dz * (y[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); } } @@ -173,10 +161,10 @@ struct CosSimGradFunctor { const size_t cols_; }; -template +template struct CosSimDxFunctor { CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dx, int cols) + const T* z, const T* dz, T* dx, T* dy, int cols) : x_norm_(x_norm), y_norm_(y_norm), x_(x), @@ -184,58 +172,34 @@ struct CosSimDxFunctor { z_(z), dz_(dz), dx_(dx), - cols_(static_cast(cols)) {} - - void operator()(const T& x_norm, const T& y_norm) const { - size_t x_offset = &x_norm - x_norm_; - - auto x_norm_square = x_norm_[x_offset] * x_norm_[x_offset]; - auto xy_norm_prod = x_norm_[x_offset] * y_norm_[0]; - auto dz = dz_[x_offset]; - auto z = z_[x_offset]; - - auto* dx = dx_ + cols_ * x_offset; - auto* x = x_ + cols_ * x_offset; - - for (size_t i = 0; i < 
cols_; ++i) { - dx[i] = dz * (y_[i] / xy_norm_prod - z * x[i] / x_norm_square); - } - } - - const T* x_norm_; - const T* y_norm_; - const T* x_; - const T* y_; - const T* z_; - const T* dz_; - T* dx_; - const size_t cols_; -}; - -template -struct CosSimDyFunctor { - CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dy, int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - dz_(dz), dy_(dy), cols_(static_cast(cols)) {} - void operator()(const T& x_norm, const T& y_norm) const { + inline void operator()(const T& x_norm, const T& y_norm) const { size_t x_offset = &x_norm - x_norm_; - auto y_norm_square = y_norm_[0] * y_norm_[0]; auto xy_norm_prod = x_norm_[x_offset] * y_norm_[0]; auto dz = dz_[x_offset]; auto z = z_[x_offset]; auto* x = x_ + cols_ * x_offset; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - for (size_t i = 0; i < cols_; ++i) { - dy_[i] += dz * (x[i] / xy_norm_prod - z * y_[i] / y_norm_square); + if (Dx) { + auto x_norm_square = x_norm_[x_offset] * x_norm_[x_offset]; + auto* dx = dx_ + cols_ * x_offset; + auto* x = x_ + cols_ * x_offset; + auto reciprocal_x_norm_square = 1 / x_norm_square; + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } else { + auto y_norm_square = y_norm_[0] * y_norm_[0]; + auto reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols_; ++i) { + dy_[i] += dz * (x[i] * reciprocal_xy_norm_prod - + z * y_[i] * reciprocal_y_norm_square); + } } } @@ -245,6 +209,7 @@ struct CosSimDyFunctor { const T* y_; const T* z_; const T* dz_; + T* dx_; T* dy_; const size_t cols_; }; @@ -287,17 +252,17 @@ class CosSimGradKernel : public framework::OpKernel { } } else { if (out_grad_x) { - CosSimDxFunctor functor( + CosSimDxFunctor functor( in_x_norm->data(), in_y_norm->data(), in_x->data(), in_y->data(), in_z->data(), in_grad_z->data(), - 
out_grad_x->mutable_data(context.GetPlace()), cols); + out_grad_x->mutable_data(context.GetPlace()), nullptr, cols); ForEachZip(in_x_norm->data(), in_x_norm->data() + rows_x, in_y_norm->data(), functor); } if (out_grad_y) { - CosSimDyFunctor functor( + CosSimDxFunctor functor( in_x_norm->data(), in_y_norm->data(), in_x->data(), - in_y->data(), in_z->data(), in_grad_z->data(), + in_y->data(), in_z->data(), in_grad_z->data(), nullptr, out_grad_y->mutable_data(context.GetPlace()), cols); ForEachZip(in_x_norm->data(), in_x_norm->data() + rows_x, in_y_norm->data(), functor); From 7ac00dd684b025a8b1ea6a34a4cdf39ce7fd792e Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 27 Dec 2017 15:23:49 +0800 Subject: [PATCH 4/8] refine --- paddle/operators/cos_sim_op.cc | 38 +++++++++ paddle/operators/cos_sim_op.cu | 45 +++++++++++ paddle/operators/cos_sim_op.h | 137 ++++++++++++++------------------- 3 files changed, 142 insertions(+), 78 deletions(-) diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 440c427cba..ab9cf745e3 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -149,6 +149,44 @@ class CosSimOpGrad : public framework::OperatorWithKernel { } }; +template +struct CosSimDyFunctor { + CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dy, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dy_(dy), + cols_(static_cast(cols)) {} + + inline void operator()(size_t offset) const { + auto xy_norm_prod = x_norm_[offset] * y_norm_[0]; + auto dz = dz_[offset]; + auto z = z_[offset]; + auto* x = x_ + cols_ * offset; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + auto y_norm_square = y_norm_[0] * y_norm_[0]; + auto reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols_; ++i) { + dy_[i] += dz * (x[i] * reciprocal_xy_norm_prod - + z * y_[i] * reciprocal_y_norm_square); + } + } + + const T* x_norm_; + 
const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dy_; + const size_t cols_; +}; + } // namespace operators } // namespace paddle diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu index 1cb01f5945..eacac68bac 100644 --- a/paddle/operators/cos_sim_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -15,6 +15,51 @@ #define EIGEN_USE_GPU #include "paddle/operators/cos_sim_op.h" +namespace paddle { +namespace operators { + +template +struct CosSimDyFunctor { + CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dy, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dy_(dy), + cols_(static_cast(cols)) {} + + inline void operator()(size_t offset) const { + auto xy_norm_prod = x_norm_[offset] * y_norm_[0]; + auto dz = dz_[offset]; + auto z = z_[offset]; + auto* x = x_ + cols_ * offset; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + auto y_norm_square = y_norm_[0] * y_norm_[0]; + auto reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols_; ++i) { + T dy = dz * (x[i] * reciprocal_xy_norm_prod - + z * y_[i] * reciprocal_y_norm_square); + paddle::paddleAtomicAdd(dy_ + i, dy) + } + } + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dy_; + const size_t cols_; +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( cos_sim, ops::CosSimKernel); diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index cd5c703c30..8b2a06a41b 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -21,10 +21,17 @@ namespace operators { using Tensor = framework::Tensor; -template -static void ForEachZip(IT1 begin1, IT1 last1, IT2 begin2, Callback callback) { - for (; begin1 < last1; ++begin1, ++begin2) { - callback(*begin1, *begin2); +template +struct 
CosSimDyFunctor { + CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dy, int cols); + inline void operator()(size_t) const; +}; + +template +static void ForEachZip(size_t num, Callback callback) { + for (size_t i = 0; i < num; ++i) { + callback(i); } } @@ -38,16 +45,11 @@ struct CosSimFunctor { z_(z), cols_(static_cast(cols)) {} - inline void operator()(T& x_norm, T& y_norm) const { - size_t x_offset = &x_norm - x_norm_; - size_t y_offset = &y_norm - y_norm_; - - auto* x = x_ + cols_ * x_offset; - - T xx = 0, xy = 0; - T yy = 0; + inline HOSTDEVICE void operator()(size_t offset) const { + auto* x = x_ + cols_ * offset; + T xx = 0, xy = 0, yy = 0; if (same_row) { - auto* y = y_ + cols_ * y_offset; + auto* y = y_ + cols_ * offset; for (size_t i = 0; i < cols_; ++i) { xx += x[i] * x[i]; yy += y[i] * y[i]; @@ -55,21 +57,20 @@ struct CosSimFunctor { } xx = sqrt(xx); yy = sqrt(yy); - x_norm_[x_offset] = xx; - y_norm_[y_offset] = yy; - z_[x_offset] = xy / (xx * yy); + y_norm_[offset] = yy; + x_norm_[offset] = xx; + z_[offset] = xy / (xx * yy); } else { // This can be wrote in a better way. 
- auto* y = y_; for (size_t i = 0; i < cols_; ++i) { xx += x[i] * x[i]; - yy += y[i] * y[i]; // only need - xy += x[i] * y[i]; + yy += y_[i] * y_[i]; // only need + xy += x[i] * y_[i]; } xx = sqrt(xx); yy = sqrt(yy); - x_norm_[x_offset] = xx; y_norm_[0] = yy; - z_[x_offset] = xy / (xx * yy); + x_norm_[offset] = xx; + z_[offset] = xy / (xx * yy); } } @@ -104,14 +105,12 @@ class CosSimKernel : public framework::OpKernel { CosSimFunctor functor( in_x->data(), in_y->data(), out_x_norm->data(), out_y_norm->data(), out_z->data(), cols); - ForEachZip(out_x_norm->data(), out_x_norm->data() + rows_x, - out_y_norm->data(), functor); + ForEachZip(rows_x, functor); } else { CosSimFunctor functor( in_x->data(), in_y->data(), out_x_norm->data(), out_y_norm->data(), out_z->data(), cols); - ForEachZip(out_x_norm->data(), out_x_norm->data() + rows_x, - out_y_norm->data(), functor); + ForEachZip(rows_x, functor); } } }; @@ -129,19 +128,15 @@ struct CosSimGradFunctor { dx_(dx), cols_(static_cast(cols)) {} - inline void operator()(const T& x_norm, const T& y_norm) const { - size_t x_offset = &x_norm - x_norm_; - size_t y_offset = &y_norm - y_norm_; + inline HOSTDEVICE void operator()(size_t offset) const { + auto x_norm_square = x_norm_[offset] * x_norm_[offset]; + auto xy_norm_prod = x_norm_[offset] * y_norm_[offset]; + auto dz = dz_[offset]; + auto z = z_[offset]; - auto x_norm_square = x_norm_[x_offset] * x_norm_[x_offset]; - auto xy_norm_prod = x_norm_[x_offset] * y_norm_[y_offset]; - auto dz = dz_[x_offset]; - auto z = z_[x_offset]; - - auto* dx = dx_ + cols_ * x_offset; - auto* x = x_ + cols_ * x_offset; - - auto* y = y_ + cols_ * y_offset; + auto* dx = dx_ + cols_ * offset; + auto* x = x_ + cols_ * offset; + auto* y = y_ + cols_ * offset; auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; auto reciprocal_x_norm_square = 1 / x_norm_square; @@ -161,10 +156,10 @@ struct CosSimGradFunctor { const size_t cols_; }; -template +template struct CosSimDxFunctor { CosSimDxFunctor(const T* 
x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dx, T* dy, int cols) + const T* z, const T* dz, T* dx, int cols) : x_norm_(x_norm), y_norm_(y_norm), x_(x), @@ -172,37 +167,23 @@ struct CosSimDxFunctor { z_(z), dz_(dz), dx_(dx), - dy_(dy), cols_(static_cast(cols)) {} - inline void operator()(const T& x_norm, const T& y_norm) const { - size_t x_offset = &x_norm - x_norm_; - - auto xy_norm_prod = x_norm_[x_offset] * y_norm_[0]; - auto dz = dz_[x_offset]; - auto z = z_[x_offset]; - auto* x = x_ + cols_ * x_offset; + inline HOSTDEVICE void operator()(size_t offset) const { + auto xy_norm_prod = x_norm_[offset] * y_norm_[0]; + auto dz = dz_[offset]; + auto z = z_[offset]; + auto* x = x_ + cols_ * offset; auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto x_norm_square = x_norm_[offset] * x_norm_[offset]; + auto* dx = dx_ + cols_ * offset; + auto reciprocal_x_norm_square = 1 / x_norm_square; - if (Dx) { - auto x_norm_square = x_norm_[x_offset] * x_norm_[x_offset]; - auto* dx = dx_ + cols_ * x_offset; - auto* x = x_ + cols_ * x_offset; - auto reciprocal_x_norm_square = 1 / x_norm_square; - for (size_t i = 0; i < cols_; ++i) { - dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - - z * x[i] * reciprocal_x_norm_square); - } - } else { - auto y_norm_square = y_norm_[0] * y_norm_[0]; - auto reciprocal_y_norm_square = 1 / y_norm_square; - for (size_t i = 0; i < cols_; ++i) { - dy_[i] += dz * (x[i] * reciprocal_xy_norm_prod - - z * y_[i] * reciprocal_y_norm_square); - } + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); } } - const T* x_norm_; const T* y_norm_; const T* x_; @@ -210,7 +191,6 @@ struct CosSimDxFunctor { const T* z_; const T* dz_; T* dx_; - T* dy_; const size_t cols_; }; @@ -239,33 +219,34 @@ class CosSimGradKernel : public framework::OpKernel { in_x_norm->data(), in_y_norm->data(), in_x->data(), in_y->data(), in_z->data(), in_grad_z->data(), 
out_grad_x->mutable_data(context.GetPlace()), cols); - ForEachZip(in_x_norm->data(), in_x_norm->data() + rows_x, - in_y_norm->data(), functor); + ForEachZip(rows_x, functor); } if (out_grad_y) { CosSimGradFunctor functor( in_y_norm->data(), in_x_norm->data(), in_y->data(), in_x->data(), in_z->data(), in_grad_z->data(), out_grad_y->mutable_data(context.GetPlace()), cols); - ForEachZip(in_y_norm->data(), in_y_norm->data() + rows_x, - in_x_norm->data(), functor); + ForEachZip(rows_x, functor); } } else { if (out_grad_x) { - CosSimDxFunctor functor( + CosSimDxFunctor functor( in_x_norm->data(), in_y_norm->data(), in_x->data(), in_y->data(), in_z->data(), in_grad_z->data(), - out_grad_x->mutable_data(context.GetPlace()), nullptr, cols); - ForEachZip(in_x_norm->data(), in_x_norm->data() + rows_x, - in_y_norm->data(), functor); + out_grad_x->mutable_data(context.GetPlace()), cols); + ForEachZip(rows_x, functor); } if (out_grad_y) { - CosSimDxFunctor functor( + out_grad_y->mutable_data(context.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, out_grad_y, static_cast(0)); + + CosSimDyFunctor functor( in_x_norm->data(), in_y_norm->data(), in_x->data(), - in_y->data(), in_z->data(), in_grad_z->data(), nullptr, - out_grad_y->mutable_data(context.GetPlace()), cols); - ForEachZip(in_x_norm->data(), in_x_norm->data() + rows_x, - in_y_norm->data(), functor); + in_y->data(), in_z->data(), in_grad_z->data(), + out_grad_y->data(), cols); + ForEachZip(rows_x, functor); } } } From de26ae416cce48705c930fcbeecd4e556e57a420 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 27 Dec 2017 23:51:23 +0800 Subject: [PATCH 5/8] add gpu code --- paddle/operators/cos_sim_op.cc | 50 +++++++++----------------- paddle/operators/cos_sim_op.cu | 66 +++++++++++++++++----------------- paddle/operators/cos_sim_op.h | 20 +++++------ 3 files changed, 59 insertions(+), 77 deletions(-) diff --git a/paddle/operators/cos_sim_op.cc 
b/paddle/operators/cos_sim_op.cc index 80e0780030..77492e60f2 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -151,42 +151,26 @@ class CosSimOpGrad : public framework::OperatorWithKernel { template struct CosSimDyFunctor { - CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dy, int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - dz_(dz), - dy_(dy), - cols_(static_cast(cols)) {} - - inline HOSTDEVICE void operator()(size_t offset) const { - auto xy_norm_prod = x_norm_[offset] * y_norm_[0]; - auto dz = dz_[offset]; - auto z = z_[offset]; - auto* x = x_ + cols_ * offset; - auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - - auto y_norm_square = y_norm_[0] * y_norm_[0]; - auto reciprocal_y_norm_square = 1 / y_norm_square; - for (size_t i = 0; i < cols_; ++i) { - dy_[i] += dz * (x[i] * reciprocal_xy_norm_prod - - z * y_[i] * reciprocal_y_norm_square); + inline void operator()(const platform::CPUDeviceContext& ctx, const T* x_norm, + const T* y_norm, const T* x, const T* y, const T* z, + const T* dz, const size_t rows, const size_t cols, + T* dy) const { + for (size_t offset = 0; offset < rows; ++offset) { + auto xy_norm_prod = x_norm[offset] * y_norm[0]; + auto dz_data = dz[offset]; + auto z_data = z[offset]; + auto* x_data = x + cols * offset; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + auto y_norm_square = y_norm[0] * y_norm[0]; + auto reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols; ++i) { + dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod - + z_data * y[i] * reciprocal_y_norm_square); + } } } - - const T* x_norm_; - const T* y_norm_; - const T* x_; - const T* y_; - const T* z_; - const T* dz_; - T* dy_; - const size_t cols_; }; - } // namespace operators } // namespace paddle diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu index 88f49c1b14..42194d7a05 100644 --- 
a/paddle/operators/cos_sim_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -20,45 +20,45 @@ namespace paddle { namespace operators { template -struct CosSimDyFunctor { - CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dy, int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - dz_(dz), - dy_(dy), - cols_(static_cast(cols)) {} - - inline HOSTDEVICE void operator()(size_t offset) const { - auto xy_norm_prod = x_norm_[offset] * y_norm_[0]; - auto dz = dz_[offset]; - auto z = z_[offset]; - auto* x = x_ + cols_ * offset; - auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; +__global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x, + const T* y, const T* z, const T* dz, + const size_t rows, const size_t cols, T* dy) { + int grid_size = blockDim.x * gridDim.x; + T y_norm_data = y_norm[0]; + for (int offset = blockIdx.x * blockDim.x + threadIdx.x; offset < rows; + offset += grid_size) { + T xy_norm_prod = x_norm[offset] * y_norm_data; + T dz_data = dz[offset]; + T z_data = z[offset]; + const T* x_data = x + cols * offset; + T reciprocal_xy_norm_prod = 1 / xy_norm_prod; - auto y_norm_square = y_norm_[0] * y_norm_[0]; - auto reciprocal_y_norm_square = 1 / y_norm_square; - for (size_t i = 0; i < cols_; ++i) { - T dy = dz * (x[i] * reciprocal_xy_norm_prod - - z * y_[i] * reciprocal_y_norm_square); - // platform::CudaAtomicAdd(dy_ + i, dy); - dy_[i] += dy; + T y_norm_square = y_norm_data * y_norm_data; + T reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols; ++i) { + T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod - + z_data * y[i] * reciprocal_y_norm_square); + platform::CudaAtomicAdd(dy + i, dy_data); } } +} - const T* x_norm_; - const T* y_norm_; - const T* x_; - const T* y_; - const T* z_; - const T* dz_; - T* dy_; - const size_t cols_; +template +struct CosSimDyFunctor { + inline void operator()(const platform::CUDADeviceContext& ctx, + const T* 
x_norm, const T* y_norm, const T* x, + const T* y, const T* z, const T* dz, const size_t rows, + const size_t cols, T* dy) const { + const int block_size = 512; + dim3 threads(block_size, 1); + dim3 grid(1, (rows + block_size - 1) / block_size); + CosSimDyKernel<<>>( + x_norm, y_norm, x, y, z, dz, rows, cols, dy); + } }; +template struct CosSimDyFunctor; + } // namespace operators } // namespace paddle diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index bb7c893a29..a913e576f9 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -193,9 +193,10 @@ struct CosSimDxFunctor { template struct CosSimDyFunctor { - CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dy, int cols); - inline HOSTDEVICE void operator()(size_t) const; + inline void operator()(const DeviceContext& ctx, const T* x_norm, + const T* y_norm, const T* x, const T* y, const T* z, + const T* dz, const size_t rows, const size_t cols, + T* dy) const; }; template @@ -255,14 +256,11 @@ class CosSimGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); set_zero(dev_ctx, out_grad_y, static_cast(0)); - CosSimDyFunctor functor( - in_x_norm->data(), in_y_norm->data(), in_x->data(), - in_y->data(), in_z->data(), in_grad_z->data(), - out_grad_y->data(), cols); - platform::ForRange for_range( - static_cast(context.device_context()), - rows_x); - for_range(functor); + CosSimDyFunctor functor; + functor(dev_ctx, in_x_norm->data(), in_y_norm->data(), + in_x->data(), in_y->data(), in_z->data(), + in_grad_z->data(), static_cast(rows_x), + static_cast(cols), out_grad_y->data()); } } } From 8bd759007a29aaece5e85b90362d8c886a8a7e5b Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 28 Dec 2017 12:53:00 +0800 Subject: [PATCH 6/8] refine CosSimDyFunctor --- paddle/operators/cos_sim_op.cu | 2 -- paddle/operators/cos_sim_op.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) 
diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu index 42194d7a05..86dc04995a 100644 --- a/paddle/operators/cos_sim_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -57,8 +57,6 @@ struct CosSimDyFunctor { } }; -template struct CosSimDyFunctor; - } // namespace operators } // namespace paddle diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index a913e576f9..7641ca15f1 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -56,7 +56,7 @@ struct CosSimFunctor { tep_x = x[i]; tep_y = y_[i]; xx += tep_x * tep_x; - yy += tep_y * tep_y; // only need + yy += tep_y * tep_y; xy += tep_x * tep_y; } xx = sqrt(xx); From 4a11fdb4ef698bb757ad310b53592c0968893b95 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 29 Dec 2017 15:07:07 +0800 Subject: [PATCH 7/8] follow comments --- paddle/operators/cos_sim_op.cc | 10 +++---- paddle/operators/cos_sim_op.cu | 12 ++++----- paddle/operators/cos_sim_op.h | 48 +++++++++++++++++----------------- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 77492e60f2..d4f3ca5e32 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -155,11 +155,11 @@ struct CosSimDyFunctor { const T* y_norm, const T* x, const T* y, const T* z, const T* dz, const size_t rows, const size_t cols, T* dy) const { - for (size_t offset = 0; offset < rows; ++offset) { - auto xy_norm_prod = x_norm[offset] * y_norm[0]; - auto dz_data = dz[offset]; - auto z_data = z[offset]; - auto* x_data = x + cols * offset; + for (size_t row_id = 0; row_id < rows; ++row_id) { + auto xy_norm_prod = x_norm[row_id] * y_norm[0]; + auto dz_data = dz[row_id]; + auto z_data = z[row_id]; + auto* x_data = x + cols * row_id; auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; auto y_norm_square = y_norm[0] * y_norm[0]; diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu index 
86dc04995a..891436c948 100644 --- a/paddle/operators/cos_sim_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -25,12 +25,12 @@ __global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x, const size_t rows, const size_t cols, T* dy) { int grid_size = blockDim.x * gridDim.x; T y_norm_data = y_norm[0]; - for (int offset = blockIdx.x * blockDim.x + threadIdx.x; offset < rows; - offset += grid_size) { - T xy_norm_prod = x_norm[offset] * y_norm_data; - T dz_data = dz[offset]; - T z_data = z[offset]; - const T* x_data = x + cols * offset; + for (int row_id = blockIdx.x * blockDim.x + threadIdx.x; row_id < rows; + row_id += grid_size) { + T xy_norm_prod = x_norm[row_id] * y_norm_data; + T dz_data = dz[row_id]; + T z_data = z[row_id]; + const T* x_data = x + cols * row_id; T reciprocal_xy_norm_prod = 1 / xy_norm_prod; T y_norm_square = y_norm_data * y_norm_data; diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 7641ca15f1..160edb0b56 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -32,11 +32,11 @@ struct CosSimFunctor { z_(z), cols_(static_cast(cols)) {} - inline HOSTDEVICE void operator()(size_t offset) const { - auto* x = x_ + cols_ * offset; + inline HOSTDEVICE void operator()(size_t row_id) const { + auto* x = x_ + cols_ * row_id; T xx = 0, xy = 0, yy = 0; if (same_row) { - auto* y = y_ + cols_ * offset; + auto* y = y_ + cols_ * row_id; T tep_x, tep_y; for (size_t i = 0; i < cols_; ++i) { tep_x = x[i]; @@ -47,9 +47,9 @@ struct CosSimFunctor { } xx = sqrt(xx); yy = sqrt(yy); - y_norm_[offset] = yy; - x_norm_[offset] = xx; - z_[offset] = xy / (xx * yy); + y_norm_[row_id] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); } else { // This can be wrote in a better way. 
T tep_x, tep_y; for (size_t i = 0; i < cols_; ++i) { @@ -61,9 +61,9 @@ struct CosSimFunctor { } xx = sqrt(xx); yy = sqrt(yy); - if (offset == 0) y_norm_[0] = yy; - x_norm_[offset] = xx; - z_[offset] = xy / (xx * yy); + if (row_id == 0) y_norm_[0] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); } } @@ -125,15 +125,15 @@ struct CosSimGradFunctor { dx_(dx), cols_(static_cast(cols)) {} - inline HOSTDEVICE void operator()(size_t offset) const { - auto x_norm_square = x_norm_[offset] * x_norm_[offset]; - auto xy_norm_prod = x_norm_[offset] * y_norm_[offset]; - auto dz = dz_[offset]; - auto z = z_[offset]; + inline HOSTDEVICE void operator()(size_t row_id) const { + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; - auto* dx = dx_ + cols_ * offset; - auto* x = x_ + cols_ * offset; - auto* y = y_ + cols_ * offset; + auto* dx = dx_ + cols_ * row_id; + auto* x = x_ + cols_ * row_id; + auto* y = y_ + cols_ * row_id; auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; auto reciprocal_x_norm_square = 1 / x_norm_square; @@ -166,14 +166,14 @@ struct CosSimDxFunctor { dx_(dx), cols_(static_cast(cols)) {} - inline HOSTDEVICE void operator()(size_t offset) const { - auto xy_norm_prod = x_norm_[offset] * y_norm_[0]; - auto dz = dz_[offset]; - auto z = z_[offset]; - auto* x = x_ + cols_ * offset; + inline HOSTDEVICE void operator()(size_t row_id) const { + auto xy_norm_prod = x_norm_[row_id] * y_norm_[0]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + auto* x = x_ + cols_ * row_id; auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - auto x_norm_square = x_norm_[offset] * x_norm_[offset]; - auto* dx = dx_ + cols_ * offset; + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto* dx = dx_ + cols_ * row_id; auto reciprocal_x_norm_square = 1 / x_norm_square; for (size_t i = 0; i < cols_; ++i) { From 24cf2fcd90a8409da2e5e38118c73eb4af13121f Mon Sep 17 
00:00:00 2001 From: chengduoZH Date: Fri, 29 Dec 2017 15:16:49 +0800 Subject: [PATCH 8/8] move cos_sim_functor to math --- paddle/operators/CMakeLists.txt | 4 +- paddle/operators/cos_sim_op.cc | 22 --- paddle/operators/cos_sim_op.cu | 45 ------ paddle/operators/cos_sim_op.h | 153 +-------------------- paddle/operators/math/CMakeLists.txt | 2 + paddle/operators/math/cos_sim_functor.cc | 48 +++++++ paddle/operators/math/cos_sim_functor.cu | 64 +++++++++ paddle/operators/math/cos_sim_functor.h | 166 +++++++++++++++++++++++ 8 files changed, 290 insertions(+), 214 deletions(-) create mode 100644 paddle/operators/math/cos_sim_functor.cc create mode 100644 paddle/operators/math/cos_sim_functor.cu create mode 100644 paddle/operators/math/cos_sim_functor.h diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 5aaaf99332..c6da04b5b4 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -210,7 +210,8 @@ set(DEPS_OPS save_op load_op send_op - recv_op) + recv_op + cos_sim_op) if(WITH_DISTRIBUTE) add_subdirectory(detail) @@ -256,6 +257,7 @@ op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(conv_transpose_op DEPS vol2col) op_library(gru_op DEPS sequence2batch gru_compute) op_library(recurrent_op SRCS recurrent_op.cc DEPS executor) +op_library(cos_sim_op DEPS cos_sim_functor) # FIXME(typhoonzero): save/load depends lodtensor serialization functions op_library(save_op DEPS lod_tensor) diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index d4f3ca5e32..9019a1edb3 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -149,28 +149,6 @@ class CosSimOpGrad : public framework::OperatorWithKernel { } }; -template -struct CosSimDyFunctor { - inline void operator()(const platform::CPUDeviceContext& ctx, const T* x_norm, - const T* y_norm, const T* x, const T* y, const T* z, - const T* dz, const size_t rows, const size_t cols, - T* dy) const { - for (size_t 
row_id = 0; row_id < rows; ++row_id) { - auto xy_norm_prod = x_norm[row_id] * y_norm[0]; - auto dz_data = dz[row_id]; - auto z_data = z[row_id]; - auto* x_data = x + cols * row_id; - auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - - auto y_norm_square = y_norm[0] * y_norm[0]; - auto reciprocal_y_norm_square = 1 / y_norm_square; - for (size_t i = 0; i < cols; ++i) { - dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod - - z_data * y[i] * reciprocal_y_norm_square); - } - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu index 891436c948..9e5d1b6e4f 100644 --- a/paddle/operators/cos_sim_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -14,51 +14,6 @@ limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/cos_sim_op.h" -#include "paddle/platform/cuda_helper.h" - -namespace paddle { -namespace operators { - -template -__global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x, - const T* y, const T* z, const T* dz, - const size_t rows, const size_t cols, T* dy) { - int grid_size = blockDim.x * gridDim.x; - T y_norm_data = y_norm[0]; - for (int row_id = blockIdx.x * blockDim.x + threadIdx.x; row_id < rows; - row_id += grid_size) { - T xy_norm_prod = x_norm[row_id] * y_norm_data; - T dz_data = dz[row_id]; - T z_data = z[row_id]; - const T* x_data = x + cols * row_id; - T reciprocal_xy_norm_prod = 1 / xy_norm_prod; - - T y_norm_square = y_norm_data * y_norm_data; - T reciprocal_y_norm_square = 1 / y_norm_square; - for (size_t i = 0; i < cols; ++i) { - T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod - - z_data * y[i] * reciprocal_y_norm_square); - platform::CudaAtomicAdd(dy + i, dy_data); - } - } -} - -template -struct CosSimDyFunctor { - inline void operator()(const platform::CUDADeviceContext& ctx, - const T* x_norm, const T* y_norm, const T* x, - const T* y, const T* z, const T* dz, const size_t rows, - const size_t cols, T* dy) 
const { - const int block_size = 512; - dim3 threads(block_size, 1); - dim3 grid(1, (rows + block_size - 1) / block_size); - CosSimDyKernel<<>>( - x_norm, y_norm, x, y, z, dz, rows, cols, dy); - } -}; - -} // namespace operators -} // namespace paddle namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 160edb0b56..eadcca55f9 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/cos_sim_functor.h" #include "paddle/operators/math/math_function.h" #include "paddle/platform/for_range.h" @@ -22,59 +23,6 @@ namespace operators { using Tensor = framework::Tensor; -template -struct CosSimFunctor { - CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - cols_(static_cast(cols)) {} - - inline HOSTDEVICE void operator()(size_t row_id) const { - auto* x = x_ + cols_ * row_id; - T xx = 0, xy = 0, yy = 0; - if (same_row) { - auto* y = y_ + cols_ * row_id; - T tep_x, tep_y; - for (size_t i = 0; i < cols_; ++i) { - tep_x = x[i]; - tep_y = y[i]; - xx += tep_x * tep_x; - yy += tep_y * tep_y; - xy += tep_x * tep_y; - } - xx = sqrt(xx); - yy = sqrt(yy); - y_norm_[row_id] = yy; - x_norm_[row_id] = xx; - z_[row_id] = xy / (xx * yy); - } else { // This can be wrote in a better way. 
- T tep_x, tep_y; - for (size_t i = 0; i < cols_; ++i) { - tep_x = x[i]; - tep_y = y_[i]; - xx += tep_x * tep_x; - yy += tep_y * tep_y; - xy += tep_x * tep_y; - } - xx = sqrt(xx); - yy = sqrt(yy); - if (row_id == 0) y_norm_[0] = yy; - x_norm_[row_id] = xx; - z_[row_id] = xy / (xx * yy); - } - } - - T* x_norm_; - T* y_norm_; - const T* x_; - const T* y_; - T* z_; - const size_t cols_; -}; - template class CosSimKernel : public framework::OpKernel { public: @@ -95,14 +43,14 @@ class CosSimKernel : public framework::OpKernel { int cols = framework::product(in_x->dims()) / rows_x; if (rows_x == rows_y) { - CosSimFunctor functor( + math::CosSimFunctor functor( in_x->data(), in_y->data(), out_x_norm->data(), out_y_norm->data(), out_z->data(), cols); platform::ForRange for_range( static_cast(context.device_context()), rows_x); for_range(functor); } else { - CosSimFunctor functor( + math::CosSimFunctor functor( in_x->data(), in_y->data(), out_x_norm->data(), out_y_norm->data(), out_z->data(), cols); platform::ForRange for_range( @@ -112,93 +60,6 @@ class CosSimKernel : public framework::OpKernel { } }; -template -struct CosSimGradFunctor { - CosSimGradFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dx, int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - dz_(dz), - dx_(dx), - cols_(static_cast(cols)) {} - - inline HOSTDEVICE void operator()(size_t row_id) const { - auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; - auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id]; - auto dz = dz_[row_id]; - auto z = z_[row_id]; - - auto* dx = dx_ + cols_ * row_id; - auto* x = x_ + cols_ * row_id; - auto* y = y_ + cols_ * row_id; - - auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - auto reciprocal_x_norm_square = 1 / x_norm_square; - for (size_t i = 0; i < cols_; ++i) { - dx[i] = dz * (y[i] * reciprocal_xy_norm_prod - - z * x[i] * reciprocal_x_norm_square); - } - } - - const T* x_norm_; - const T* 
y_norm_; - const T* x_; - const T* y_; - const T* z_; - const T* dz_; - T* dx_; - const size_t cols_; -}; - -template -struct CosSimDxFunctor { - CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dx, int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - dz_(dz), - dx_(dx), - cols_(static_cast(cols)) {} - - inline HOSTDEVICE void operator()(size_t row_id) const { - auto xy_norm_prod = x_norm_[row_id] * y_norm_[0]; - auto dz = dz_[row_id]; - auto z = z_[row_id]; - auto* x = x_ + cols_ * row_id; - auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; - auto* dx = dx_ + cols_ * row_id; - auto reciprocal_x_norm_square = 1 / x_norm_square; - - for (size_t i = 0; i < cols_; ++i) { - dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - - z * x[i] * reciprocal_x_norm_square); - } - } - const T* x_norm_; - const T* y_norm_; - const T* x_; - const T* y_; - const T* z_; - const T* dz_; - T* dx_; - const size_t cols_; -}; - -template -struct CosSimDyFunctor { - inline void operator()(const DeviceContext& ctx, const T* x_norm, - const T* y_norm, const T* x, const T* y, const T* z, - const T* dz, const size_t rows, const size_t cols, - T* dy) const; -}; - template class CosSimGradKernel : public framework::OpKernel { public: @@ -220,7 +81,7 @@ class CosSimGradKernel : public framework::OpKernel { if (rows_x == rows_y) { if (out_grad_x) { - CosSimGradFunctor functor( + math::CosSimGradFunctor functor( in_x_norm->data(), in_y_norm->data(), in_x->data(), in_y->data(), in_z->data(), in_grad_z->data(), out_grad_x->mutable_data(context.GetPlace()), cols); @@ -230,7 +91,7 @@ class CosSimGradKernel : public framework::OpKernel { for_range(functor); } if (out_grad_y) { - CosSimGradFunctor functor( + math::CosSimGradFunctor functor( in_y_norm->data(), in_x_norm->data(), in_y->data(), in_x->data(), in_z->data(), in_grad_z->data(), 
out_grad_y->mutable_data(context.GetPlace()), cols); @@ -241,7 +102,7 @@ class CosSimGradKernel : public framework::OpKernel { } } else { if (out_grad_x) { - CosSimDxFunctor functor( + math::CosSimDxFunctor functor( in_x_norm->data(), in_y_norm->data(), in_x->data(), in_y->data(), in_z->data(), in_grad_z->data(), out_grad_x->mutable_data(context.GetPlace()), cols); @@ -256,7 +117,7 @@ class CosSimGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); set_zero(dev_ctx, out_grad_y, static_cast(0)); - CosSimDyFunctor functor; + math::CosSimDyFunctor functor; functor(dev_ctx, in_x_norm->data(), in_y_norm->data(), in_x->data(), in_y->data(), in_z->data(), in_grad_z->data(), static_cast(rows_x), diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index bf47879f77..830ae53cbe 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -16,6 +16,7 @@ if(WITH_GPU) nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context) nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function) + nv_library(cos_sim_functor SRCS cos_sim_functor.cc cos_sim_functor.cu DEPS device_context) else() cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) @@ -30,6 +31,7 @@ else() cc_library(maxouting SRCS maxouting.cc DEPS device_context) cc_library(unpooling SRCS unpooling.cc DEPS device_context) cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function) + cc_library(cos_sim_functor SRCS cos_sim_functor.cc DEPS device_context) endif() cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) diff --git a/paddle/operators/math/cos_sim_functor.cc 
b/paddle/operators/math/cos_sim_functor.cc new file mode 100644 index 0000000000..f52a82b108 --- /dev/null +++ b/paddle/operators/math/cos_sim_functor.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/cos_sim_functor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct CosSimDyFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const T* x_norm, + const T* y_norm, const T* x, const T* y, const T* z, + const T* dz, const size_t rows, const size_t cols, + T* dy) const { + for (size_t row_id = 0; row_id < rows; ++row_id) { + auto xy_norm_prod = x_norm[row_id] * y_norm[0]; + auto dz_data = dz[row_id]; + auto z_data = z[row_id]; + auto* x_data = x + cols * row_id; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + auto y_norm_square = y_norm[0] * y_norm[0]; + auto reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols; ++i) { + dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod - + z_data * y[i] * reciprocal_y_norm_square); + } + } + } +}; + +template class CosSimDyFunctor; +template class CosSimDyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/cos_sim_functor.cu b/paddle/operators/math/cos_sim_functor.cu new file mode 100644 index 0000000000..fb19a8b38a --- /dev/null +++ b/paddle/operators/math/cos_sim_functor.cu @@ -0,0 +1,64 @@ +/* 
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/cos_sim_functor.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x, + const T* y, const T* z, const T* dz, + const size_t rows, const size_t cols, T* dy) { + int grid_size = blockDim.x * gridDim.x; + T y_norm_data = y_norm[0]; + for (int row_id = blockIdx.x * blockDim.x + threadIdx.x; row_id < rows; + row_id += grid_size) { + T xy_norm_prod = x_norm[row_id] * y_norm_data; + T dz_data = dz[row_id]; + T z_data = z[row_id]; + const T* x_data = x + cols * row_id; + T reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + T y_norm_square = y_norm_data * y_norm_data; + T reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols; ++i) { + T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod - + z_data * y[i] * reciprocal_y_norm_square); + platform::CudaAtomicAdd(dy + i, dy_data); + } + } +} + +template +struct CosSimDyFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const T* x_norm, + const T* y_norm, const T* x, const T* y, const T* z, + const T* dz, const size_t rows, const size_t cols, + T* dy) const { + const int block_size = 512; + dim3 threads(block_size, 1); + dim3 grid((rows + block_size - 1) / block_size, 1); // grid is 1-D in x + CosSimDyKernel<<>>( + x_norm, y_norm, x, y, z, dz, 
rows, cols, dy); + } +}; + +template class CosSimDyFunctor; +template class CosSimDyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/cos_sim_functor.h b/paddle/operators/math/cos_sim_functor.h new file mode 100644 index 0000000000..aae8ab5b7a --- /dev/null +++ b/paddle/operators/math/cos_sim_functor.h @@ -0,0 +1,166 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/platform/device_context.h" +#include "paddle/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct CosSimFunctor { + CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto* x = x_ + cols_ * row_id; + T xx = 0, xy = 0, yy = 0; + if (same_row) { + auto* y = y_ + cols_ * row_id; + T tep_x, tep_y; + for (size_t i = 0; i < cols_; ++i) { + tep_x = x[i]; + tep_y = y[i]; + xx += tep_x * tep_x; + yy += tep_y * tep_y; + xy += tep_x * tep_y; + } + xx = sqrt(xx); + yy = sqrt(yy); + y_norm_[row_id] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); + } else { // This can be written in a better way. 
+ T tep_x, tep_y; + for (size_t i = 0; i < cols_; ++i) { + tep_x = x[i]; + tep_y = y_[i]; + xx += tep_x * tep_x; + yy += tep_y * tep_y; + xy += tep_x * tep_y; + } + xx = sqrt(xx); + yy = sqrt(yy); + if (row_id == 0) y_norm_[0] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); + } + } + + T* x_norm_; + T* y_norm_; + const T* x_; + const T* y_; + T* z_; + const size_t cols_; +}; + +template +struct CosSimGradFunctor { + CosSimGradFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + + auto* dx = dx_ + cols_ * row_id; + auto* x = x_ + cols_ * row_id; + auto* y = y_ + cols_ * row_id; + + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto reciprocal_x_norm_square = 1 / x_norm_square; + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDxFunctor { + CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto xy_norm_prod = x_norm_[row_id] * y_norm_[0]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + auto* x = x_ + cols_ * row_id; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto* dx = dx_ + cols_ * row_id; + 
auto reciprocal_x_norm_square = 1 / x_norm_square; + + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDyFunctor { + void operator()(const DeviceContext& ctx, const T* x_norm, const T* y_norm, + const T* x, const T* y, const T* z, const T* dz, + const size_t rows, const size_t cols, T* dy) const; +}; + +} // namespace math +} // namespace operators +} // namespace paddle