Merge pull request #239 from hedaoyuan/tensor

Add TensorExpression
Yu Yang committed 8 years ago (via GitHub)
commit 82774dbbd3

@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_MATRIX_TYPE_CUH_
#define HL_MATRIX_TYPE_CUH_
#include "hl_base.h"
#ifdef __CUDA_ARCH__
// typedef void* vecType;
#include <vector_types.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef float4 vecType;
@ -37,4 +35,10 @@ typedef __m128d vecType;
#endif
#endif
#endif /* HL_MATRIX_TYPE_CUH_ */
#ifdef __CUDA_ARCH__
#define INLINE __device__ inline
#else
#define INLINE inline
#endif
#endif // HL_MATRIX_TYPE_CUH_
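
The INLINE macro added above lets a single definition serve both compilation modes: nvcc's device pass sees __device__ inline, while a plain host build sees inline. A minimal sketch of the intended use (the helper below is hypothetical, not part of this diff; real is Paddle's float/double typedef selected by PADDLE_TYPE_DOUBLE):

INLINE real scaleAdd(real a, real b, real p) { return a + p * b; }  // callable from host and, under nvcc, from device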

File diff suppressed because it is too large.

@ -289,7 +289,7 @@ void forward(Argument& act) {
useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->abs(*act.value);
act.value->abs2(*act.value);
}
void backward(Argument& act) { act.grad->absDerivative(*act.in); }
@ -311,7 +311,7 @@ void forward(Argument& act) {
useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->square(*act.value);
act.value->square2(*act.value);
}
void backward(Argument& act) { act.grad->squareDerivative(*act.in); }
@ -324,7 +324,7 @@ END_DEFINE_ACTIVATION(square)
* \f]
*/
BEGIN_DEFINE_ACTIVATION(exponential)
void forward(Argument& act) { act.value->exp(*act.value); }
void forward(Argument& act) { act.value->exp2(*act.value); }
void backward(Argument& act) { act.grad->expDerivative(*act.value); }
END_DEFINE_ACTIVATION(exponential)
@ -345,7 +345,7 @@ void forward(Argument& act) {
useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->log(*act.value);
act.value->log2(*act.value);
}
void backward(Argument& act) { act.grad->dotDiv(*act.grad, *act.in); }
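
The changes in this file are mechanical renames: each eager, in-place element-wise member gains a 2 suffix (abs2, square2, exp2, log2), presumably to free the unsuffixed names for the lazily evaluated TensorExpression operators this PR introduces. A hedged sketch of the distinction (the lazy form is assumed, not shown in this excerpt):

act.value->exp2(*act.value);      // eager: computes immediately, in place
// *act.value = act.value->exp(); // assumed lazy form: builds an expression
//                                // node, evaluated once on assignment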

@ -40,7 +40,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
savedMean_->mulScalar(1.0 / numSamples); // E[x]
tmpMat_->assign(*mat);
tmpMat_->square();
tmpMat_->square2();
savedInvVar_->zeroMem();
savedInvVar_->accumulateColSum(*tmpMat_);
savedInvVar_->mulScalar(1.0 / numSamples); // E[x^2]
@ -54,7 +54,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
calMovingMeanAndVar();
savedInvVar_->subScalar(-EPS);
savedInvVar_->sqrt(*savedInvVar_);
savedInvVar_->sqrt2(*savedInvVar_);
}
void BatchNormalizationLayer::calMovingMeanAndVar() {
@ -85,7 +85,7 @@ void BatchNormalizationLayer::setMeanAndStd() {
savedInvVar_->downClip(real(0.0));
savedInvVar_->subScalar(-EPS);
savedInvVar_->sqrt(*savedInvVar_);
savedInvVar_->sqrt2(*savedInvVar_);
}
void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) {
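
For reference, the E[x] / E[x^2] accumulations above, like the std computation in Layer::showOutputStats further down, rely on the shortcut variance identity

\operatorname{Var}[x] = E[x^2] - (E[x])^2

after which savedInvVar_ is set to \sqrt{\operatorname{Var}[x] + \text{EPS}} (note that subScalar(-EPS) adds EPS).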

@ -115,12 +115,12 @@ void MultiClassCrossEntropyWithSelfNorm::forwardImp(Matrix& output,
Matrix& target) {
Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_);
output.rowSum(*sftMaxSum_);
sftMaxSum_->log();
sftMaxSum_->log2();
target.oneHotCrossEntropy(output, *label.ids);
target.add(*sftMaxSum_);
sftMaxSum_->square();
sftMaxSum_->square2();
target.add(*sftMaxSum_, config_.softmax_selfnorm_alpha());
}
@ -131,12 +131,12 @@ void MultiClassCrossEntropyWithSelfNorm::backwardImp(Matrix& output,
output.rowSum(*sftMaxSum_);
Matrix::resizeOrCreate(sumInv_, output.getHeight(), 1, false, useGpu_);
sftMaxSum_->reciprocal(*sumInv_);
sftMaxSum_->reciprocal2(*sumInv_);
outputG.oneHotCrossEntropyBp(output, *label.ids);
outputG.addColumnVector(*sumInv_);
sftMaxSum_->log();
sftMaxSum_->log2();
sumInv_->dotMul(*sumInv_, *sftMaxSum_);
sumInv_->mulScalar(2 * config_.softmax_selfnorm_alpha());

@ -316,12 +316,12 @@ void Layer::showOutputStats() {
auto tmpMat = dynamic_cast<CpuSparseMatrix*>(outSquare.get());
min = tmpMat->getMin();
max = tmpMat->getMax();
tmpMat->square();
tmpMat->square2();
LOG(INFO) << "show statistics of [none zero values] in sparse matrix";
} else {
min = outSquare->getMin();
max = outSquare->getMax();
outSquare->square();
outSquare->square2();
}
real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean;
std = std > 0 ? std : 0;

@ -60,7 +60,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
expX_->assign(*matX);
// subtract max to avoid overflow or underflow
expX_->mul(maxX_, ones_, (real)-1, (real)1);
expX_->exp();
expX_->exp2();
real* a = a_->getData();
real* b = b_->getData();
@ -69,7 +69,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
real* expX = expX_->getData();
real* maxX = maxX_->getData();
expW_->exp(*w_);
expW_->exp2(*w_);
real* expW = expW_->getData();
for (int i = 0; i < numClasses_; ++i) {

@ -99,7 +99,7 @@ void PowerLayer::backward(const UpdateCallback& callback) {
Matrix::resizeOrCreate(tmpMtx, batchSize, dataDim, false, useGpu_);
if (inG0) {
tmpMtx->log(*inV1);
tmpMtx->log2(*inV1);
tmpMtx->dotMul(*tmpMtx, *outV);
// inG0 += outG .* (log(inV1) * outV)

@ -355,11 +355,11 @@ void BaseMatrixT<T>::neg() { applyUnary(unary::Neg<T>()); }
DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a));
template<>
void BaseMatrixT<real>::exp() { applyUnary(unary::Exp<real>()); }
void BaseMatrixT<real>::exp2() { applyUnary(unary::Exp<real>()); }
DEFINE_MATRIX_UNARY_OP(Log, a = log(a));
template<>
void BaseMatrixT<real>::log() {
void BaseMatrixT<real>::log2() {
if (useGpu_) {
applyUnary(unary::Log<real>());
} else {
@ -369,23 +369,23 @@ void BaseMatrixT<real>::log() {
DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a));
template<>
void BaseMatrixT<real>::sqrt() { applyUnary(unary::Sqrt<real>()); }
void BaseMatrixT<real>::sqrt2() { applyUnary(unary::Sqrt<real>()); }
DEFINE_MATRIX_UNARY_OP(Square, a = a * a);
template<class T>
void BaseMatrixT<T>::square() { applyUnary(unary::Square<T>()); }
void BaseMatrixT<T>::square2() { applyUnary(unary::Square<T>()); }
DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a);
template<class T>
void BaseMatrixT<T>::reciprocal() { applyUnary(unary::Reciprocal<T>()); }
void BaseMatrixT<T>::reciprocal2() { applyUnary(unary::Reciprocal<T>()); }
DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a);
template<class T>
void BaseMatrixT<T>::abs() { applyUnary(unary::Abs<T>()); }
void BaseMatrixT<T>::abs2() { applyUnary(unary::Abs<T>()); }
DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0));
template<class T>
void BaseMatrixT<T>::sign() { applyUnary(unary::Sign<T>()); }
void BaseMatrixT<T>::sign2() { applyUnary(unary::Sign<T>()); }
DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
template<class T>
@ -405,7 +405,7 @@ void BaseMatrixT<T>::one() { applyUnary(unary::One<T>()); }
DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p));
template<>
void BaseMatrixT<real>::pow(real p) {
void BaseMatrixT<real>::pow2(real p) {
if (useGpu_) {
applyUnary(unary::Pow<real>(p));
} else {
@ -534,7 +534,7 @@ void BaseMatrixT<T>::add(BaseMatrixT& b, T p) {
DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p));
template<>
void BaseMatrixT<real>::pow(BaseMatrixT& b, real p) {
void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
if (useGpu_) {
applyBinary(binary::Pow<real>(p), b);
} else {
@ -615,7 +615,7 @@ void BaseMatrixT<T>::breluDerivative(BaseMatrixT& b) {
DEFINE_MATRIX_BINARY_OP(Square, b = a * a);
template<class T>
void BaseMatrixT<T>::square(BaseMatrixT& b) {
void BaseMatrixT<T>::square2(BaseMatrixT& b) {
applyBinary(binary::Square<T>(), b);
}
@ -657,7 +657,7 @@ void BaseMatrixT<T>::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) {
DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a);
template<class T>
void BaseMatrixT<T>::reciprocal(BaseMatrixT& b) {
void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b) {
applyBinary(binary::Reciprocal<T>(), b);
}
@ -669,7 +669,7 @@ void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) {
DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a);
template<class T>
void BaseMatrixT<T>::abs(BaseMatrixT& b) { applyBinary(binary::Abs<T>(), b); }
void BaseMatrixT<T>::abs2(BaseMatrixT& b) { applyBinary(binary::Abs<T>(), b); }
DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0);
template<class T>
@ -729,17 +729,19 @@ void BaseMatrixT<T>::expDerivative(BaseMatrixT& b) {
DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f);
template<class T>
void BaseMatrixT<T>::sign(BaseMatrixT& b) { applyBinary(binary::Sign<T>(), b); }
void BaseMatrixT<T>::sign2(BaseMatrixT& b) {
applyBinary(binary::Sign<T>(), b);
}
DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b));
template<>
void BaseMatrixT<real>::exp(BaseMatrixT& b) {
void BaseMatrixT<real>::exp2(BaseMatrixT& b) {
applyBinary(binary::Exp<real>(), b);
}
DEFINE_MATRIX_BINARY_OP(Log, a = log(b));
template<>
void BaseMatrixT<real>::log(BaseMatrixT& b) {
void BaseMatrixT<real>::log2(BaseMatrixT& b) {
if (useGpu_) {
applyBinary(binary::Log<real>(), b);
} else {
@ -749,7 +751,7 @@ void BaseMatrixT<real>::log(BaseMatrixT& b) {
DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b));
template<>
void BaseMatrixT<real>::sqrt(BaseMatrixT& b) {
void BaseMatrixT<real>::sqrt2(BaseMatrixT& b) {
applyBinary(binary::Sqrt<real>(), b);
}
@ -1065,7 +1067,7 @@ void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c);
template<class T>
void BaseMatrixT<T>::max(BaseMatrixT& b, BaseMatrixT& c) { // NOLINT
void BaseMatrixT<T>::max2(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::Max<T>(), b, c);
}
@ -1168,7 +1170,7 @@ void BaseMatrixT<T>::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2,
DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER,
a = 1 / (p1 * b + p2));
template<class T>
void BaseMatrixT<T>::reciprocal(BaseMatrixT& b, T p1, T p2) {
void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b, T p1, T p2) {
applyBinary(binary::Reciprocal2<T>(p1, p2), b);
}
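
A standalone sketch of the functor pattern behind these definitions: DEFINE_MATRIX_UNARY_OP presumably expands to a small functor whose operator() applies the expression to each element, and applyUnary walks the matrix with it. The buffer-level stand-in below is illustrative, not Paddle's actual applyUnary:

#include <cmath>
#include <cstdio>

namespace unary_sketch {
template <class T>
struct Exp {  // what DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a)) plausibly generates
  void operator()(T& a) const { a = std::exp(a); }
};
}  // namespace unary_sketch

template <class T, class Op>
void applyUnaryToBuffer(T* data, int n, Op op) {  // hypothetical stand-in
  for (int i = 0; i < n; ++i) op(data[i]);
}

int main() {
  float v[3] = {0.f, 1.f, 2.f};
  applyUnaryToBuffer(v, 3, unary_sketch::Exp<float>());
  std::printf("%g %g %g\n", v[0], v[1], v[2]);  // 1 2.71828 7.38906
}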

@ -16,6 +16,7 @@ limitations under the License. */
#include <cstddef>
#include <stdint.h>
#include "paddle/utils/TypeDefs.h"
#include "TensorExpression.h"
namespace paddle {
@ -70,7 +71,7 @@ public:
};
template <class T>
class BaseMatrixT {
class BaseMatrixT : public TensorExpression<BaseMatrixT<T>, T> {
public:
size_t height_, width_;
size_t stride_;
@ -427,14 +428,14 @@ public:
*
*/
void neg();
void exp();
void pow(T p);
void log();
void sqrt();
void square();
void reciprocal();
void abs();
void sign();
void exp2();
void pow2(T p);
void log2();
void sqrt2();
void square2();
void reciprocal2();
void abs2();
void sign2();
void zero();
/**
@ -603,7 +604,7 @@ public:
* b = this * this
* @endcode
*/
void square(BaseMatrixT& b);
void square2(BaseMatrixT& b);
void squareDerivative(BaseMatrixT& b);
/**
@ -627,7 +628,7 @@ public:
* b = 1.0f / this
* @endcode
*/
void reciprocal(BaseMatrixT& b);
void reciprocal2(BaseMatrixT& b);
void reciprocalDerivative(BaseMatrixT& b);
/**
@ -635,7 +636,7 @@ public:
* b = this > 0.0f ? this : -this
* @endcode
*/
void abs(BaseMatrixT& b);
void abs2(BaseMatrixT& b);
void absDerivative(BaseMatrixT& b);
/**
@ -653,12 +654,12 @@ public:
*/
void expDerivative(BaseMatrixT& b);
void sign(BaseMatrixT& b);
void sign2(BaseMatrixT& b);
void exp(BaseMatrixT& b);
void pow(BaseMatrixT& b, T p);
void log(BaseMatrixT& b);
void sqrt(BaseMatrixT& b);
void exp2(BaseMatrixT& b);
void pow2(BaseMatrixT& b, T p);
void log2(BaseMatrixT& b);
void sqrt2(BaseMatrixT& b);
void addScalar(BaseMatrixT& b, T p);
void subScalar(BaseMatrixT& b, T p);
void mulScalar(BaseMatrixT& b, T p);
@ -828,7 +829,7 @@ public:
* this = b>c ? b : c
* @endcode
*/
void max(BaseMatrixT& b, BaseMatrixT& c); // NOLINT
void max2(BaseMatrixT& b, BaseMatrixT& c);
/**
* @code
@ -927,7 +928,7 @@ public:
* this = 1 / (p1 * b + p2)
* @endcode
*/
void reciprocal(BaseMatrixT& b, T p1, T p2);
void reciprocal2(BaseMatrixT& b, T p1, T p2);
/**
* @code
@ -1050,6 +1051,32 @@ public:
void rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
virtual bool isSparse() const { return false; }
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
if (useGpu_) {
TensorGpuApply<T>(*this, expr);
} else {
TensorCpuApply<T>(*this, expr);
}
}
template <typename ExpressionType>
void operator+=(const ExpressionType& expr) {
(*this) = (*this) + expr;
}
template <typename ExpressionType>
void operator-=(const ExpressionType& expr) {
(*this) = (*this) - expr;
}
template <typename ExpressionType>
void operator*=(const ExpressionType& expr) {
(*this) = (*this) * expr;
}
template <typename ExpressionType>
void operator/=(const ExpressionType& expr) {
(*this) = (*this) / expr;
}
};
typedef BaseMatrixT<real> BaseMatrix;
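
With the templated operators above, any tensor expression assigned to a BaseMatrixT is evaluated in a single CPU loop or GPU kernel, and the compound forms reduce to plain assignment. A hedged usage sketch, assuming the usual CpuMatrix(height, width) constructor and that TensorExpression supplies lazy element-wise operators such as +:

CpuMatrix a(100, 200), b(100, 200), c(100, 200);
a = b + c;  // builds an expression tree; one fused pass via TensorCpuApply
a += b;     // rewritten by operator+= as a = a + b, evaluated once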

@ -16,10 +16,12 @@ file(GLOB MATH_HEADERS . *.h)
file(GLOB MATH_SOURCES . *.cpp)
set(MATH_SOURCES
"${PROJ_ROOT}/paddle/math/BaseMatrix.cu"
"${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu"
${MATH_SOURCES})
if(NOT WITH_GPU)
# then compile BaseMatrix.cu as c++ file
compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/BaseMatrix.cu")
compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu")
add_library(paddle_math STATIC
${MATH_SOURCES})
else()

@ -136,7 +136,7 @@ public:
return sum;
}
virtual void square() {
virtual void square2() {
CHECK(isContiguous());
if (valueType_ == NO_VALUE) {
return;

@ -1122,6 +1122,7 @@ public:
virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
LOG(FATAL) << "Not implemented";
}
virtual void bilinearForward(const Matrix& in,
const size_t inImgH,
const size_t inImgW,
@ -1142,6 +1143,15 @@ public:
const real ratioW) {
LOG(FATAL) << "Not implemented";
}
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
if (useGpu_) {
TensorGpuApply<real>(*this, expr);
} else {
TensorCpuApply<real>(*this, expr);
}
}
};
inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
@ -1518,6 +1528,11 @@ public:
void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
TensorGpuApply<real>(*this, expr);
}
};
class CpuMatrix : public Matrix {
@ -1917,6 +1932,11 @@ public:
const size_t numChannels,
const real ratioH,
const real ratioW);
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
TensorCpuApply<real>(*this, expr);
}
};
class SharedCpuMatrix : public CpuMatrix {
@ -1957,6 +1977,7 @@ public:
void add(real p1, real p2);
private:
using Matrix::mul;
void initShared(int blockNum);
void initBlock(int blockNum);

@ -0,0 +1,211 @@
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
/**
* \brief The tensor evaluator classes.
*/
template <typename Derived, class T>
class TensorApply {
public:
explicit INLINE TensorApply(const Derived& p)
: data_(p.data_),
stride_(p.stride_),
height_(p.height_),
width_(p.width_),
useGpu_(p.useGpu_) {}
INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
INLINE T apply(int index) const { return data_[index]; }
INLINE T& applyRef(int i, int j) { return data_[i * stride_ + j]; }
INLINE T& applyRef(int index) { return data_[index]; }
INLINE size_t getWidth() const { return width_; }
INLINE size_t getHeight() const { return height_; }
INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
INLINE bool useGpu() const { return useGpu_; }
T* data_;
size_t stride_;
size_t height_;
size_t width_;
bool useGpu_;
};
/**
* \brief The tensor evaluator classes.
* Evaluator for rvalues (read-only data).
*/
template <typename Derived, class T>
class TensorApply<const Derived, T> {
public:
explicit INLINE TensorApply(const Derived& p)
: data_(p.data_),
stride_(p.stride_),
height_(p.height_),
width_(p.width_),
useGpu_(p.useGpu_) {}
INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
INLINE T apply(int index) const { return data_[index]; }
INLINE size_t getWidth() const { return width_; }
INLINE size_t getHeight() const { return height_; }
INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
INLINE bool useGpu() const { return useGpu_; }
const T* data_;
size_t stride_;
size_t height_;
size_t width_;
bool useGpu_;
};
template <typename Derived, class T>
class TensorApply<const TensorExpression<Derived, T>, T> {
public:
explicit TensorApply(const TensorExpression<Derived, T>& expr)
: expr_(expr.derived()) {}
INLINE T apply(int i, int j) const { return expr_.apply(i, j); }
INLINE T apply(int index) const { return expr_.apply(index); }
INLINE size_t getWidth() const { return expr_.getWidth(); }
INLINE size_t getHeight() const { return expr_.getHeight(); }
INLINE bool isContiguous() const { return expr_.isContiguous(); }
INLINE bool useGpu() const { return expr_.useGpu(); }
TensorApply<const Derived, T> expr_;
};
/**
* \brief The unary expression evaluator classes.
*/
template <class OP, typename ArgType, class T>
class TensorApply<const TensorUnaryOp<OP, ArgType, T>, T> {
public:
explicit INLINE TensorApply(const TensorUnaryOp<OP, ArgType, T>& expr)
: op_(expr.op_), expr_(expr.expr_) {}
INLINE T apply(int i, int j) const { return op_(expr_.apply(i, j)); }
INLINE T apply(int index) const { return op_(expr_.apply(index)); }
INLINE size_t getWidth() const { return expr_.getWidth(); }
INLINE size_t getHeight() const { return expr_.getHeight(); }
INLINE bool isContiguous() const { return expr_.isContiguous(); }
INLINE bool useGpu() const { return expr_.useGpu(); }
const OP op_;
TensorApply<ArgType, T> expr_;
};
/**
* \brief The binary expression evaluator classes.
*/
template <class OP, typename LhsType, typename RhsType, class T>
class TensorApply<const TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
public:
explicit INLINE TensorApply(
const TensorBinaryOp<OP, LhsType, RhsType, T>& expr)
: op_(expr.op_), lhs_(expr.lhs_), rhs_(expr.rhs_) {
#ifndef __CUDA_ARCH__
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
#endif
}
INLINE T apply(int i, int j) const {
return op_(lhs_.apply(i, j), rhs_.apply(i, j));
}
INLINE T apply(int index) const {
return op_(lhs_.apply(index), rhs_.apply(index));
}
INLINE size_t getWidth() const { return lhs_.getWidth(); }
INLINE size_t getHeight() const { return rhs_.getHeight(); }
INLINE bool isContiguous() const {
return lhs_.isContiguous() && rhs_.isContiguous();
}
INLINE bool useGpu() const { return lhs_.useGpu(); }
const OP op_;
TensorApply<LhsType, T> lhs_;
TensorApply<RhsType, T> rhs_;
};
/**
* \brief The ternary expression evaluator classes.
*/
template <typename ArgType1, typename ArgType2, typename ArgType3, class T>
class TensorApply<const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>, T> {
public:
explicit INLINE TensorApply(
const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>& expr)
: expr1_(expr.expr1_), expr2_(expr.expr2_), expr3_(expr.expr3_) {
#ifndef __CUDA_ARCH__
CHECK_EQ(expr1_.getWidth(), expr2_.getWidth());
CHECK_EQ(expr1_.getWidth(), expr3_.getWidth());
CHECK_EQ(expr1_.getHeight(), expr2_.getHeight());
CHECK_EQ(expr1_.getHeight(), expr3_.getHeight());
CHECK_EQ(expr1_.useGpu(), expr2_.useGpu());
CHECK_EQ(expr1_.useGpu(), expr3_.useGpu());
#endif
}
INLINE T apply(int i, int j) const {
return expr1_.apply(i, j) ? expr2_.apply(i, j) : expr3_.apply(i, j);
}
INLINE T apply(int index) const {
return expr1_.apply(index) ? expr2_.apply(index) : expr3_.apply(index);
}
INLINE size_t getWidth() const { return expr1_.getWidth(); }
INLINE size_t getHeight() const { return expr1_.getHeight(); }
INLINE bool isContiguous() const {
return expr1_.isContiguous() && expr2_.isContiguous() &&
expr3_.isContiguous();
}
INLINE bool useGpu() const { return expr1_.useGpu(); }
TensorApply<ArgType1, T> expr1_;
TensorApply<ArgType2, T> expr2_;
TensorApply<ArgType3, T> expr3_;
};
/**
* \brief The const expression evaluator classes.
*/
template <class OP, typename ArgType, class T>
class TensorApply<const TensorConstant<OP, ArgType, T>, T> {
public:
explicit INLINE TensorApply(const TensorConstant<OP, ArgType, T>& expr)
: op_(expr.op_), expr_(expr.expr_) {}
INLINE T apply(int i, int j) const { return op_(i, j); }
INLINE T apply(int index) const { return op_(index); }
INLINE size_t getWidth() const { return expr_.getWidth(); }
INLINE size_t getHeight() const { return expr_.getHeight(); }
INLINE bool isContiguous() const { return true; }
INLINE bool useGpu() const { return expr_.useGpu(); }
const OP op_;
TensorApply<ArgType, T> expr_;
};
} // namespace paddle
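
The point of these evaluator specializations is fusion: every node exposes apply(i, j), so a composed expression is computed element by element in one traversal with no temporaries. A self-contained toy illustrating the same idea (not Paddle code):

#include <cstdio>
#include <vector>

struct Buf {  // leaf node: raw storage with row-major apply()
  std::vector<float> d;
  size_t w;
  float apply(int i, int j) const { return d[i * w + j]; }
};

template <class L, class R>
struct Add {  // interior node: evaluates both children on demand
  const L& l;
  const R& r;
  float apply(int i, int j) const { return l.apply(i, j) + r.apply(i, j); }
};

int main() {
  Buf a{{1, 2, 3, 4}, 2}, b{{10, 20, 30, 40}, 2};
  Add<Buf, Buf> e{a, b};
  std::printf("%g\n", e.apply(1, 1));  // 44: computed lazily, no temp matrix
}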

@ -0,0 +1,158 @@
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/utils/Logging.h"
namespace paddle {
/**
* \brief Tensor assign expression (returned by lazyAssign
* and evaluated by AssignEvaluate)
*/
template <typename LhsType, typename RhsType, class T>
class TensorAssignOp {
public:
explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
: lhs_(lhs), rhs_(rhs) {
#ifndef __CUDA_ARCH__
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
#endif
}
INLINE void apply(const int i, const int j) {
lhs_.applyRef(i, j) = rhs_.apply(i, j);
}
INLINE void apply(const int index) {
lhs_.applyRef(index) = rhs_.apply(index);
}
INLINE size_t getWidth() const { return lhs_.getWidth(); }
INLINE size_t getHeight() const { return rhs_.getHeight(); }
INLINE bool isContiguous() const {
return lhs_.isContiguous() && rhs_.isContiguous();
}
INLINE bool useGpu() const { return lhs_.useGpu(); }
private:
TensorApply<LhsType, T> lhs_;
TensorApply<const RhsType, T> rhs_;
};
template <typename Assign, typename... AssignOp>
void AssignCpuEvaluate(int height,
int width,
bool isContiguous,
Assign&& assign,
AssignOp&&... args) {
if (isContiguous) {
int size = height * width;
for (int index = 0; index < size; index++) {
assign.apply(index);
__attribute__((unused)) int dummy[] = {(((args)).apply(index), 0)...};
}
} else {
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
assign.apply(i, j);
__attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
}
}
}
}
#ifdef __NVCC__
template <typename Assign, typename... AssignOp>
__global__ void AssignGpuEvaluate1(const int border,
Assign assign,
AssignOp... args) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < border) {
assign.apply(idx);
__attribute__((unused)) int dummy[] = {(((args)).apply(idx), 0)...};
}
}
template <typename Assign, typename... AssignOp>
__global__ void AssignGpuEvaluate2(const int height,
const int width,
Assign assign,
AssignOp... args) {
const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) {
for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) {
assign.apply(i, j);
__attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
}
}
}
#endif
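
The int dummy[] = {((args).apply(...), 0)...} lines above are the pre-C++17 pack-expansion idiom for calling a member function on every trailing argument in order. A standalone sketch of just that trick:

#include <cstdio>

struct Printer {
  int v;
  void apply(int i) { std::printf("%d ", v + i); }
};

template <typename... Ts>
void applyAll(int i, Ts&&... args) {
  // Expands to {(a.apply(i), 0), (b.apply(i), 0), ...}; the comma operator
  // discards each call's result so the initializer list stays well-formed.
  __attribute__((unused)) int dummy[] = {((args).apply(i), 0)...};
}

int main() {
  Printer a{1}, b{2};
  applyAll(10, a, b);  // prints "11 12 "
  std::printf("\n");
}
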
/**
* \brief Evaluate one or more TensorAssignOp objects.
*
* \note At least one assignment expression is required
*/
template <typename Assign, typename... AssignOp>
void AssignEvaluate(Assign&& assign, AssignOp&&... args) {
const bool useGpu_ = assign.useGpu();
bool isContiguous_ = assign.isContiguous();
const size_t height = assign.getHeight();
const size_t width = assign.getWidth();
const int packSize = sizeof...(args);
const bool packUseGpu[] = {((args)).useGpu()...};
const bool packIsContiguous[] = {((args)).isContiguous()...};
const size_t packHeight[] = {((args)).getHeight()...};
const size_t packWidth[] = {((args)).getWidth()...};
for (int i = 0; i < packSize; i++) {
CHECK_EQ(useGpu_, packUseGpu[i]);
CHECK_EQ(height, packHeight[i]);
CHECK_EQ(width, packWidth[i]);
isContiguous_ = isContiguous_ && packIsContiguous[i];
}
if (useGpu_) {
#ifdef __NVCC__
if (isContiguous_) {
int size = height * width;
int blockSize = size <= 1024 ? size : 1024;
int gridSize = (size + 1024 - 1) / 1024;
AssignGpuEvaluate1<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
size, assign, args...);
} else {
int blockSizeY = std::min(32, (int)height);
int blockSizeX = (32 / blockSizeY) * 32;
int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX);
int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY);
dim3 threads(blockSizeX, blockSizeY);
dim3 grid(gridSizeX, gridSizeY);
AssignGpuEvaluate2<<<grid, threads, 0, STREAM_DEFAULT>>>(
height, width, assign, args...);
}
CHECK_SYNC("AssignEvaluate failed");
#endif
} else {
AssignCpuEvaluate(height, width, isContiguous_, assign, args...);
}
}
} // namespace paddle

@ -0,0 +1,109 @@
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/utils/Logging.h"
#include "hl_base.h"
namespace paddle {
/**
* \brief The tensor CPU evaluate API.
*/
template <class T, typename LeftType, typename RightType>
inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) {
TensorApply<LeftType, T> lhs_(lhs);
TensorApply<const RightType, T> rhs_(rhs);
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
int height = lhs_.getHeight();
int width = lhs_.getWidth();
if (lhs_.isContiguous() && rhs_.isContiguous()) {
int size = height * width;
for (int index = 0; index < size; index++) {
lhs_.applyRef(index) = rhs_.apply(index);
}
} else {
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
lhs_.applyRef(i, j) = rhs_.apply(i, j);
}
}
}
}
#ifdef __NVCC__
template <typename LeftType, typename RightType>
__global__ void TensorElementWiseOp(LeftType lhs,
RightType rhs,
const int border) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < border) {
lhs.applyRef(idx) = rhs.apply(idx);
}
}
template <typename LeftType, typename RightType>
__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) {
const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
for (int i = rowIdx; i < lhs.getHeight(); i += gridDim.y * blockDim.y) {
for (int j = colIdx; j < lhs.getWidth(); j += gridDim.x * blockDim.x) {
lhs.applyRef(i, j) = rhs.apply(i, j);
}
}
}
/**
* \brief The tensor GPU evaluate API.
*/
template <class T, typename LeftType, typename RightType>
inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {
TensorApply<LeftType, T> lhs_(lhs);
TensorApply<const RightType, T> rhs_(rhs);
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
int dimM = lhs_.getHeight();
int dimN = lhs_.getWidth();
if (lhs_.isContiguous() && rhs_.isContiguous()) {
int size = dimM * dimN;
int blockSize = size <= 1024 ? size : 1024;
int gridSize = (size + 1024 - 1) / 1024;
TensorElementWiseOp<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
lhs_, rhs_, size);
} else {
int blockSizeY = std::min(32, dimM);
int blockSizeX = (32 / blockSizeY) * 32;
int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
dim3 threads(blockSizeX, blockSizeY);
dim3 grid(gridSizeX, gridSizeY);
TensorElementWiseOp<<<grid, threads, 0, STREAM_DEFAULT>>>(lhs_, rhs_);
}
CHECK_SYNC("TensorGpuApply failed");
}
#else
template <class T, typename LeftType, typename RightType>
inline void TensorGpuApply(LeftType& lhs, RightType& rhs) {}
#endif
} // namespace paddle
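
A worked example of the launch-shape arithmetic in the non-contiguous branch above (host-side only; the 100 x 300 shape is hypothetical). The kernel's grid-stride loops cover any remainder, so the caps at 32 are safe:

#include <algorithm>
#include <cstdio>

int main() {
  int dimM = 100, dimN = 300;  // hypothetical non-contiguous shape
  int blockSizeY = std::min(32, dimM);                                 // 32
  int blockSizeX = (32 / blockSizeY) * 32;                             // 32
  int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);  // 10
  int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);  // 4
  std::printf("block %dx%d, grid %dx%d\n",
              blockSizeX, blockSizeY, gridSizeX, gridSizeY);
}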

File diff suppressed because it is too large.

File diff suppressed because it is too large.

@ -0,0 +1,122 @@
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/utils/Logging.h"
#include "BaseMatrix.h"
namespace paddle {
/**
* \brief Sparse Momentum optimizer.
*/
extern void sparseMomentumApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& momU,
BaseMatrix& momV,
real alpha,
real beta,
real gamma,
real tau,
real learningRate);
/**
* \brief AdaDelta optimizer.
*/
extern void adadeltaApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& sum,
BaseMatrix& sum1,
BaseMatrix& mom,
BaseMatrix& lr,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate);
/**
* \brief AdaGrad optimizer.
*/
extern void adagradApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& sum,
BaseMatrix& sum1,
BaseMatrix& mom,
BaseMatrix& lr,
real epsilon,
real learningRate,
real momentum,
real decayRate);
/**
* \brief RMSProp optimizer.
*/
extern void rmspropApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& g,
BaseMatrix& f,
BaseMatrix& mom,
BaseMatrix& lr,
real accumulatedRou,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate,
bool firstTime);
/**
* \brief Decayed AdaGrad optimizer.
*/
extern void decayedAdagradApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& accum,
BaseMatrix& lr,
real accumulatedRou,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate,
bool firstTime);
/**
* \brief Adam optimizer.
*/
extern void adamApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& v,
real beta1,
real beta2,
real beta1_power,
real beta2_power,
real epsilon,
real learningRate);
/**
* \brief AdaMax optimizer.
*/
extern void adamaxApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom, // first moment
BaseMatrix& u, // weighted infinity norm
real beta1,
real beta2,
int64_t step,
real alpha);
} // namespace paddle
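
For reference, the update rule behind adamApply, matching the comments in the test harness further down (the other kernels follow their textbook definitions analogously):

m_t = \beta_1 m_{t-1} + (1 - \beta_1)\, g_t
v_t = \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2
\theta_t = \theta_{t-1} - \alpha \cdot \frac{\sqrt{1 - \beta_2^t}}{1 - \beta_1^t} \cdot \frac{m_t}{\sqrt{v_t} + \epsilon}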

@ -265,6 +265,15 @@ public:
/// print the "idx" element of the Vector
virtual void printOneElement(std::ostream& os, size_t idx) const = 0;
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
if (BaseVector<T>::useGpu_) {
TensorGpuApply<T>(*this, expr);
} else {
TensorCpuApply<T>(*this, expr);
}
}
protected:
friend class GpuVectorT<T>;
friend class CpuVectorT<T>;
@ -322,6 +331,11 @@ public:
virtual void print(std::ostream& os, size_t num) const;
virtual void printOneElement(std::ostream& os, size_t idx) const;
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
TensorGpuApply<T>(*this, expr);
}
protected:
virtual void copyTo(CpuVectorT<T>* dest) const;
virtual void copyTo(GpuVectorT<T>* dest) const;
@ -385,6 +399,11 @@ public:
virtual T get(size_t pos);
virtual void print(std::ostream& os, size_t num) const;
virtual void printOneElement(std::ostream& os, size_t idx) const;
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
TensorCpuApply<T>(*this, expr);
}
};
template <class T>

@ -2,6 +2,7 @@
add_simple_unittest(test_ExecViaCpu)
add_simple_unittest(test_SIMDFunctions)
add_simple_unittest(test_TrainingAlgorithm)
add_simple_unittest(test_SparseMatrix)
# TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
@ -13,6 +14,21 @@ add_simple_unittest(test_sparseMatrixCompare)
add_simple_unittest(test_perturbation)
add_simple_unittest(test_CpuGpuVector)
add_simple_unittest(test_Allocator)
if(WITH_GPU)
if(COMPILER_SUPPORT_CXX11)
CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu)
link_paddle_test(test_Tensor)
CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
link_paddle_test(test_lazyAssign)
endif()
else()
compile_cu_as_cpp(test_Tensor.cu)
add_unittest(test_Tensor test_Tensor.cu)
compile_cu_as_cpp(test_lazyAssign.cu)
add_unittest(test_lazyAssign test_lazyAssign.cu)
endif(WITH_GPU)
add_simple_unittest(test_FPException)
add_simple_unittest(test_GpuProfiler)
add_simple_unittest(test_BaseMatrix)

@ -0,0 +1,201 @@
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/utils/GlobalConstants.h"
#include "paddle/math/Vector.h"
using namespace paddle; // NOLINT
void SparseMomentumParameterOptimizer(const VectorPtr vecs[],
real alpha,
real beta,
real gamma,
real tau,
real learningRate) {
vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
-alpha * gamma * learningRate);
vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
tau * alpha * gamma * learningRate);
vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
tau / beta + 1.0 / alpha,
*vecs[PARAMETER_MOMENTUM_VT],
1.0 / beta);
}
void AdagradParameterOptimizer(const VectorPtr vecs[],
real epsilon,
real learningRate,
real momentum,
real decayRate) {
vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT],
1.0f);
vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM],
*vecs[PARAMETER_GRADIENT_SQURESUM1]);
vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_MOMENTUM],
*vecs[PARAMETER_LEARNING_RATE],
learningRate,
momentum,
decayRate);
}
void AdaDeltaParameterOptimizer(const VectorPtr vecs[],
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate) {
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
*vecs[PARAMETER_GRADIENT], rou, 1.0f - rou);
// learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) )
vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1],
*vecs[PARAMETER_GRADIENT_SQURESUM],
epsilon,
epsilon);
vecs[PARAMETER_LEARNING_RATE]->sqrt2();
// E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul(
*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_LEARNING_RATE],
rou,
1.0f - rou);
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_MOMENTUM],
*vecs[PARAMETER_LEARNING_RATE],
learningRate,
momentum,
decayRate);
}
void RMSPropParameterOptimizer(const VectorPtr vecs[],
real accumulatedRou,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate,
bool firstTime) {
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
// For the first update, make the sum the current squared gradient
// so that the initial estimate of E(g_t^2) will not be too small.
vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
*vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
// E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g
vecs[PARAMETER_GRADIENT_SQURESUM1]->add(
*vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou);
// learn_rate = 1/sqrt( E(g_t^2) - (E(g_t))^2 + epsilon )
// Basically, if the sign of the gradient changes more often,
// the learning rate will be decreased.
vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]);
vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1],
-1.0f);
vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_MOMENTUM],
*vecs[PARAMETER_LEARNING_RATE],
learningRate,
momentum,
decayRate);
}
void DecayedAdagradParameterOptimizer(const VectorPtr vecs[],
real accumulatedRou,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate,
bool firstTime) {
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
// For the first update, make the sum the current squared gradient
// so that the initial estimate of E(g_t^2) will not be too small.
vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
*vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
// learn_rate = 1/sqrt( E(g_t^2) + epsilon )
// Basically, the bigger the magnitude of the gradient,
// the smaller the learning rate will be.
vecs[PARAMETER_LEARNING_RATE]->assign(epsilon);
vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]);
vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_MOMENTUM],
*vecs[PARAMETER_LEARNING_RATE],
learningRate,
momentum,
decayRate);
}
void AdamParameterOptimizer(const VectorPtr vecs[],
real beta1,
real beta2,
real beta1_power,
real beta2_power,
real epsilon,
real learningRate) {
Vector* m = vecs[PARAMETER_MOMENTUM].get();
Vector* g = vecs[PARAMETER_GRADIENT].get();
Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get();
Vector* theta = vecs[PARAMETER_VALUE].get();
// m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
m->add(*g, beta1, 1 - beta1);
// v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_t^2
g->square2();
v->add(*g, beta2, 1 - beta2);
// tmp = m_t / ( \sqrt{v_t} + \epsilon )
// \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp
g->sqrt2(*v);
g->dotDiv(*m, *g, 0., epsilon);
real alpha =
learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
theta->add(*theta, 1.0, *g, -alpha);
}
void AdamaxParameterOptimizer(
const VectorPtr vecs[], real beta1, real beta2, int64_t step, real alpha) {
Vector* m = vecs[PARAMETER_MOMENTUM].get();
Vector* g = vecs[PARAMETER_GRADIENT].get();
Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
Vector* theta = vecs[PARAMETER_VALUE].get();
// m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
m->add(*g, beta1, 1 - beta1);
// u_t = max(\beta_2*u_{t-1}, abs(g_t))
u->mulScalar(beta2);
g->abs2();
u->max2(*u, *g);
// \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
g->dotDiv(*m, *u);
real learningRate = alpha / (1 - std::pow(beta1, step));
theta->add(*theta, 1.0, *g, -learningRate);
}

@ -0,0 +1,46 @@
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
// Performance Check
#ifdef PADDLE_DISABLE_TIMER
#define EXPRESSION_PERFORMANCE(expression) expression;
#else
#include "paddle/utils/Stat.h"
using namespace paddle; // NOLINT
#define EXPRESSION_PERFORMANCE(expression) \
do { \
char expr[30]; \
strncpy(expr, #expression, 30); \
if (expr[29] != '\0') { \
expr[27] = '.'; \
expr[28] = '.'; \
expr[29] = '\0'; \
} \
expression; \
for (int i = 0; i < 20; i++) { \
REGISTER_TIMER(expr); \
expression; \
} \
LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') \
<< *globalStat.getStat(expr); \
globalStat.reset(); \
} while (0)
#endif
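
A hedged usage sketch of the macro above: it runs the wrapped statement once as warm-up, then 20 timed iterations under REGISTER_TIMER keyed by the (possibly truncated) stringified expression, and logs the aggregate stat. A, B, C below are illustrative names, not from this diff:

EXPRESSION_PERFORMANCE(A = B + C);  // logs timing stats recorded for "A = B + C"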

@ -37,13 +37,13 @@ TEST(BaseMatrix, void) {
};
compare(&BaseMatrix::neg);
compare(&BaseMatrix::exp);
compare(&BaseMatrix::log);
compare(&BaseMatrix::sqrt);
compare(&BaseMatrix::square);
compare(&BaseMatrix::reciprocal);
compare(&BaseMatrix::abs);
compare(&BaseMatrix::sign);
compare(&BaseMatrix::exp2);
compare(&BaseMatrix::log2);
compare(&BaseMatrix::sqrt2);
compare(&BaseMatrix::square2);
compare(&BaseMatrix::reciprocal2);
compare(&BaseMatrix::abs2);
compare(&BaseMatrix::sign2);
compare(&BaseMatrix::zero);
compare(&BaseMatrix::one);
}
@ -59,7 +59,7 @@ TEST(BaseMatrix, real) {
test.cmpWithoutArg<0>(f, height, width);
};
compare(&BaseMatrix::pow);
compare(&BaseMatrix::pow2);
compare(&BaseMatrix::subScalar);
compare(&BaseMatrix::mulScalar);
compare(&BaseMatrix::divScalar);
@ -88,21 +88,21 @@ TEST(BaseMatrix, BaseMatrix) {
compare(&BaseMatrix::softreluDerivative);
compare(&BaseMatrix::brelu);
compare(&BaseMatrix::breluDerivative);
compare(&BaseMatrix::square);
compare(&BaseMatrix::square2);
compare(&BaseMatrix::squareDerivative);
compare(&BaseMatrix::tanh);
compare(&BaseMatrix::tanhDerivative);
compare(&BaseMatrix::reciprocal);
compare(&BaseMatrix::reciprocal2);
compare(&BaseMatrix::reciprocalDerivative);
compare(&BaseMatrix::abs);
compare(&BaseMatrix::abs2);
compare(&BaseMatrix::absDerivative);
compare(&BaseMatrix::sigmoid);
compare(&BaseMatrix::sigmoidDerivative);
compare(&BaseMatrix::expDerivative);
compare(&BaseMatrix::sign);
compare(&BaseMatrix::exp);
compare(&BaseMatrix::log);
compare(&BaseMatrix::sqrt);
compare(&BaseMatrix::sign2);
compare(&BaseMatrix::exp2);
compare(&BaseMatrix::log2);
compare(&BaseMatrix::sqrt2);
compare(&BaseMatrix::dotMul);
compare(&BaseMatrix::dotMulSquare);
compare(&BaseMatrix::dotSquareMul);
@ -143,7 +143,7 @@ TEST(BaseMatrix, BaseMatrix_real) {
compare(&BaseMatrix::addBias);
compare(&BaseMatrix::add);
compare(&BaseMatrix::sub);
compare(&BaseMatrix::pow);
compare(&BaseMatrix::pow2);
compare(&BaseMatrix::addScalar);
compare(&BaseMatrix::subScalar);
compare(&BaseMatrix::mulScalar);
@ -176,7 +176,7 @@ TEST(BaseMatrix, BaseMatrix_BaseMatrix) {
compare(&BaseMatrix::logisticRegressionLoss);
compare(&BaseMatrix::logisticRegressionLossBp);
compare(&BaseMatrix::biggerThan);
compare(&BaseMatrix::max);
compare(&BaseMatrix::max2);
compare(&BaseMatrix::dotMulSquare);
compare(&BaseMatrix::dotSquareSquare);
}

File diff suppressed because it is too large.

Some files were not shown because too many files have changed in this diff.
