commit
6216b59597
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,211 @@
|
||||
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace paddle {
|
||||
|
||||
/**
 * \brief The tensor evaluator classes.
 *
 * Primary template: evaluator for an "lvalue" leaf node. Wraps a
 * matrix-like object (Derived must expose data_, stride_, height_,
 * width_, useGpu_) and provides element read and write access for
 * expression evaluation.
 */
template <typename Derived, class T>
class TensorApply {
public:
  // Copies the raw buffer pointer and layout metadata from the wrapped
  // object; no ownership is taken.
  explicit INLINE TensorApply(const Derived& p)
      : data_(p.data_),
        stride_(p.stride_),
        height_(p.height_),
        width_(p.width_),
        useGpu_(p.useGpu_) {}

  // Read element at (row i, column j).
  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
  // Read element by flat index (only meaningful when contiguous).
  INLINE T apply(int index) const { return data_[index]; }
  // Writable access at (row i, column j).
  INLINE T& applyRef(int i, int j) { return data_[i * stride_ + j]; }
  // Writable access by flat index (only meaningful when contiguous).
  INLINE T& applyRef(int index) { return data_[index]; }

  INLINE size_t getWidth() const { return width_; }
  INLINE size_t getHeight() const { return height_; }
  // Rows are adjacent in memory when the row stride equals the width,
  // or trivially when there is a single row.
  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
  INLINE bool useGpu() const { return useGpu_; }

  T* data_;        // raw element buffer (not owned)
  size_t stride_;  // element distance between consecutive rows
  size_t height_;  // number of rows
  size_t width_;   // number of columns
  bool useGpu_;    // device flag propagated from the wrapped object
};
|
||||
|
||||
/**
 * \brief The tensor evaluator classes.
 * evaluator for rvalues
 *
 * Const specialization: identical layout to the primary template but
 * read-only — the data pointer is const and there is no applyRef.
 */
template <typename Derived, class T>
class TensorApply<const Derived, T> {
public:
  // Copies the (const) buffer pointer and layout metadata; no ownership.
  explicit INLINE TensorApply(const Derived& p)
      : data_(p.data_),
        stride_(p.stride_),
        height_(p.height_),
        width_(p.width_),
        useGpu_(p.useGpu_) {}

  // Read element at (row i, column j).
  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
  // Read element by flat index (only meaningful when contiguous).
  INLINE T apply(int index) const { return data_[index]; }

  INLINE size_t getWidth() const { return width_; }
  INLINE size_t getHeight() const { return height_; }
  // Contiguous when stride equals width, or trivially for one row.
  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
  INLINE bool useGpu() const { return useGpu_; }

  const T* data_;  // raw element buffer (not owned, read-only)
  size_t stride_;  // element distance between consecutive rows
  size_t height_;  // number of rows
  size_t width_;   // number of columns
  bool useGpu_;    // device flag propagated from the wrapped object
};
|
||||
|
||||
template <typename Derived, class T>
|
||||
class TensorApply<const TensorExpression<Derived, T>, T> {
|
||||
public:
|
||||
explicit TensorApply(const TensorExpression<Derived, T>& expr)
|
||||
: expr_(expr.derived()) {}
|
||||
|
||||
INLINE T apply(int i, int j) const { return expr_.apply(i, j); }
|
||||
INLINE T apply(int index) const { return expr_.apply(index); }
|
||||
|
||||
INLINE size_t getWidth() const { return expr_.getWidth(); }
|
||||
INLINE size_t getHeight() const { return expr_.getHeight(); }
|
||||
INLINE bool isContiguous() const { return expr_.isContiguous(); }
|
||||
INLINE bool useGpu() const { return expr_.useGpu(); }
|
||||
|
||||
TensorApply<const Derived, T> expr_;
|
||||
};
|
||||
|
||||
/**
 * \brief The unary expression evaluator classes.
 *
 * Applies the functor op_ to each element produced by the single
 * sub-expression; shape, layout and device are those of the operand.
 */
template <class OP, typename ArgType, class T>
class TensorApply<const TensorUnaryOp<OP, ArgType, T>, T> {
public:
  explicit INLINE TensorApply(const TensorUnaryOp<OP, ArgType, T>& expr)
      : op_(expr.op_), expr_(expr.expr_) {}

  // Element value = op(operand element).
  INLINE T apply(int i, int j) const { return op_(expr_.apply(i, j)); }
  INLINE T apply(int index) const { return op_(expr_.apply(index)); }

  // Shape / layout / device forwarded from the operand.
  INLINE size_t getWidth() const { return expr_.getWidth(); }
  INLINE size_t getHeight() const { return expr_.getHeight(); }
  INLINE bool isContiguous() const { return expr_.isContiguous(); }
  INLINE bool useGpu() const { return expr_.useGpu(); }

  const OP op_;                  // element-wise functor
  TensorApply<ArgType, T> expr_; // evaluator of the operand
};
|
||||
|
||||
/**
|
||||
* \brief The binary expression evaluator classes.
|
||||
*/
|
||||
template <class OP, typename LhsType, typename RhsType, class T>
|
||||
class TensorApply<const TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
|
||||
public:
|
||||
explicit INLINE TensorApply(
|
||||
const TensorBinaryOp<OP, LhsType, RhsType, T>& expr)
|
||||
: op_(expr.op_), lhs_(expr.lhs_), rhs_(expr.rhs_) {
|
||||
#ifndef __CUDA_ARCH__
|
||||
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
|
||||
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
|
||||
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
|
||||
#endif
|
||||
}
|
||||
|
||||
INLINE T apply(int i, int j) const {
|
||||
return op_(lhs_.apply(i, j), rhs_.apply(i, j));
|
||||
}
|
||||
INLINE T apply(int index) const {
|
||||
return op_(lhs_.apply(index), rhs_.apply(index));
|
||||
}
|
||||
|
||||
INLINE size_t getWidth() const { return lhs_.getWidth(); }
|
||||
INLINE size_t getHeight() const { return rhs_.getHeight(); }
|
||||
INLINE bool isContiguous() const {
|
||||
return lhs_.isContiguous() && rhs_.isContiguous();
|
||||
}
|
||||
INLINE bool useGpu() const { return lhs_.useGpu(); }
|
||||
|
||||
const OP op_;
|
||||
TensorApply<LhsType, T> lhs_;
|
||||
TensorApply<RhsType, T> rhs_;
|
||||
};
|
||||
|
||||
/**
 * \brief The ternary expression evaluator classes.
 *
 * Element-wise select: expr1 is the condition, expr2 supplies the value
 * when the condition is non-zero, expr3 otherwise. All three operands
 * must agree in shape and device placement (checked on the host at
 * construction time).
 */
template <typename ArgType1, typename ArgType2, typename ArgType3, class T>
class TensorApply<const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>, T> {
public:
  explicit INLINE TensorApply(
      const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>& expr)
      : expr1_(expr.expr1_), expr2_(expr.expr2_), expr3_(expr.expr3_) {
#ifndef __CUDA_ARCH__
    // Host-only validation; device code compiles the checks out.
    CHECK_EQ(expr1_.getWidth(), expr2_.getWidth());
    CHECK_EQ(expr1_.getWidth(), expr3_.getWidth());
    CHECK_EQ(expr1_.getHeight(), expr2_.getHeight());
    CHECK_EQ(expr1_.getHeight(), expr3_.getHeight());
    CHECK_EQ(expr1_.useGpu(), expr2_.useGpu());
    CHECK_EQ(expr1_.useGpu(), expr3_.useGpu());
#endif
  }

  // Element value = cond ? then-branch : else-branch, evaluated per cell.
  INLINE T apply(int i, int j) const {
    return expr1_.apply(i, j) ? expr2_.apply(i, j) : expr3_.apply(i, j);
  }
  INLINE T apply(int index) const {
    return expr1_.apply(index) ? expr2_.apply(index) : expr3_.apply(index);
  }

  // Dimensions come from the first operand (all verified equal above).
  INLINE size_t getWidth() const { return expr1_.getWidth(); }
  INLINE size_t getHeight() const { return expr1_.getHeight(); }
  // Flat indexing is only valid when every operand is contiguous.
  INLINE bool isContiguous() const {
    return expr1_.isContiguous() && expr2_.isContiguous() &&
           expr3_.isContiguous();
  }
  INLINE bool useGpu() const { return expr1_.useGpu(); }

  TensorApply<ArgType1, T> expr1_;  // condition operand
  TensorApply<ArgType2, T> expr2_;  // value when condition is non-zero
  TensorApply<ArgType3, T> expr3_;  // value when condition is zero
};
|
||||
|
||||
/**
 * \brief The const expression evaluator classes.
 *
 * The functor op_ produces the element value directly from the element
 * position; expr_ is consulted only for the result shape and device flag.
 */
template <class OP, typename ArgType, class T>
class TensorApply<const TensorConstant<OP, ArgType, T>, T> {
public:
  explicit INLINE TensorApply(const TensorConstant<OP, ArgType, T>& expr)
      : op_(expr.op_), expr_(expr.expr_) {}

  // Value depends only on the coordinates, never on stored data.
  INLINE T apply(int i, int j) const { return op_(i, j); }
  INLINE T apply(int index) const { return op_(index); }

  // Shape and device flag are borrowed from the reference expression.
  INLINE size_t getWidth() const { return expr_.getWidth(); }
  INLINE size_t getHeight() const { return expr_.getHeight(); }
  // A generated constant has no memory layout, so it is always contiguous.
  INLINE bool isContiguous() const { return true; }
  INLINE bool useGpu() const { return expr_.useGpu(); }

  const OP op_;                   // position -> value generator
  TensorApply<ArgType, T> expr_;  // supplies shape/device only
};
|
||||
|
||||
} // namespace paddle
|
||||
@ -0,0 +1,158 @@
|
||||
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include "paddle/utils/Logging.h"
|
||||
|
||||
namespace paddle {
|
||||
|
||||
/**
|
||||
* \brief Tensor Assign Expression(return by lazyAssign,
|
||||
* and evaluated by AssignEvaluate)
|
||||
*/
|
||||
template <typename LhsType, typename RhsType, class T>
|
||||
class TensorAssignOp {
|
||||
public:
|
||||
explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
|
||||
: lhs_(lhs), rhs_(rhs) {
|
||||
#ifndef __CUDA_ARCH__
|
||||
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
|
||||
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
|
||||
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
|
||||
#endif
|
||||
}
|
||||
|
||||
INLINE void apply(const int i, const int j) {
|
||||
lhs_.applyRef(i, j) = rhs_.apply(i, j);
|
||||
}
|
||||
INLINE void apply(const int index) {
|
||||
lhs_.applyRef(index) = rhs_.apply(index);
|
||||
}
|
||||
|
||||
INLINE size_t getWidth() const { return lhs_.getWidth(); }
|
||||
INLINE size_t getHeight() const { return rhs_.getHeight(); }
|
||||
INLINE bool isContiguous() const {
|
||||
return lhs_.isContiguous() && rhs_.isContiguous();
|
||||
}
|
||||
INLINE bool useGpu() const { return lhs_.useGpu(); }
|
||||
|
||||
private:
|
||||
TensorApply<LhsType, T> lhs_;
|
||||
TensorApply<const RhsType, T> rhs_;
|
||||
};
|
||||
|
||||
/**
 * \brief CPU driver that evaluates a leading assignment plus any number
 *        of additional assignments over every element of a
 *        height x width tensor.
 *
 * When the data is contiguous a single flat loop is used; otherwise
 * elements are visited through (row, column) coordinates. At each
 * position the leading assignment runs first, then each extra one in
 * pack order.
 */
template <typename Assign, typename... AssignOp>
void AssignCpuEvaluate(int height,
                       int width,
                       bool isContiguous,
                       Assign&& assign,
                       AssignOp&&... args) {
  if (!isContiguous) {
    for (int row = 0; row < height; row++) {
      for (int col = 0; col < width; col++) {
        assign.apply(row, col);
        // Pack-expansion trick: runs args.apply(row, col) left-to-right.
        __attribute__((unused)) int dummy[] = {(((args)).apply(row, col),
                                                0)...};
      }
    }
  } else {
    const int total = height * width;
    for (int pos = 0; pos < total; pos++) {
      assign.apply(pos);
      // Pack-expansion trick: runs args.apply(pos) left-to-right.
      __attribute__((unused)) int dummy[] = {(((args)).apply(pos), 0)...};
    }
  }
}
|
||||
|
||||
#ifdef __NVCC__
/**
 * \brief 1-D CUDA kernel: each thread handles one flat element index
 *        below `border`, running the leading assignment and then every
 *        extra assignment in pack order at that index.
 */
template <typename Assign, typename... AssignOp>
__global__ void AssignGpuEvaluate1(const int border,
                                   Assign assign,
                                   AssignOp... args) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < border) {
    assign.apply(idx);
    // Pack-expansion trick: runs args.apply(idx) left-to-right.
    __attribute__((unused)) int dummy[] = {(((args)).apply(idx), 0)...};
  }
}
|
||||
|
||||
/**
 * \brief 2-D CUDA kernel for non-contiguous data: threads step over rows
 *        and columns with grid-stride loops (i advances by
 *        gridDim.y * blockDim.y, j by gridDim.x * blockDim.x), running
 *        the leading assignment and each extra one at every (i, j).
 */
template <typename Assign, typename... AssignOp>
__global__ void AssignGpuEvaluate2(const int height,
                                   const int width,
                                   Assign assign,
                                   AssignOp... args) {
  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
  for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) {
    for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) {
      assign.apply(i, j);
      // Pack-expansion trick: runs args.apply(i, j) left-to-right.
      __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
    }
  }
}
#endif
|
||||
|
||||
/**
 * \brief Evaluate one or more TensorAssignOp objects.
 *
 * Validates that every extra assignment matches the leading one in
 * device placement and shape, then dispatches the whole pack to a single
 * fused CPU loop or one GPU kernel launch.
 *
 * \note At least one assignment expression is required: with an empty
 *       `args` pack the pack-expanded arrays below would have zero
 *       length, which is not valid C++.
 */
template <typename Assign, typename... AssignOp>
void AssignEvaluate(Assign&& assign, AssignOp&&... args) {
  const bool useGpu_ = assign.useGpu();
  bool isContiguous_ = assign.isContiguous();
  const size_t height = assign.getHeight();
  const size_t width = assign.getWidth();

  // Expand the pack into plain arrays so each member's properties can be
  // cross-checked against the leading assignment.
  const int packSize = sizeof...(args);
  const bool packUseGpu[] = {((args)).useGpu()...};
  const bool packIsContiguous[] = {((args)).isContiguous()...};
  const size_t packHeight[] = {((args)).getHeight()...};
  const size_t packWidth[] = {((args)).getWidth()...};

  for (int i = 0; i < packSize; i++) {
    CHECK_EQ(useGpu_, packUseGpu[i]);
    CHECK_EQ(height, packHeight[i]);
    CHECK_EQ(width, packWidth[i]);
    // Flat indexing is only usable if every assignment is contiguous.
    isContiguous_ = isContiguous_ && packIsContiguous[i];
  }

  if (useGpu_) {
#ifdef __NVCC__
    if (isContiguous_) {
      // 1-D launch: one thread per element, at most 1024 threads/block.
      // NOTE(review): `int size = height * width` can overflow for very
      // large tensors — confirm expected tensor sizes.
      int size = height * width;
      int blockSize = size <= 1024 ? size : 1024;
      int gridSize = (size + 1024 - 1) / 1024;
      AssignGpuEvaluate1<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
          size, assign, args...);
    } else {
      // 2-D launch with grid-stride kernels. blockSizeX * blockSizeY is
      // always <= 1024 (floor(32/Y)*32*Y <= 32*32) and the grid is
      // capped at 32x32 blocks.
      int blockSizeY = std::min(32, (int)height);
      int blockSizeX = (32 / blockSizeY) * 32;
      int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX);
      int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY);
      dim3 threads(blockSizeX, blockSizeY);
      dim3 grid(gridSizeX, gridSizeY);
      AssignGpuEvaluate2<<<grid, threads, 0, STREAM_DEFAULT>>>(
          height, width, assign, args...);
    }

    CHECK_SYNC("AssignEvaluate failed");
#endif
  } else {
    AssignCpuEvaluate(height, width, isContiguous_, assign, args...);
  }
}
|
||||
|
||||
} // namespace paddle
|
||||
@ -0,0 +1,109 @@
|
||||
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include "paddle/utils/Logging.h"
|
||||
#include "hl_base.h"
|
||||
|
||||
namespace paddle {
|
||||
|
||||
/**
|
||||
* \brief The tensor cpu evaluate api.
|
||||
*/
|
||||
template <class T, typename LeftType, typename RightType>
|
||||
inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) {
|
||||
TensorApply<LeftType, T> lhs_(lhs);
|
||||
TensorApply<const RightType, T> rhs_(rhs);
|
||||
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
|
||||
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
|
||||
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
|
||||
|
||||
int height = lhs_.getHeight();
|
||||
int width = lhs_.getWidth();
|
||||
if (lhs_.isContiguous() && rhs_.isContiguous()) {
|
||||
int size = height * width;
|
||||
for (int index = 0; index < size; index++) {
|
||||
lhs_.applyRef(index) = rhs_.apply(index);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < height; i++) {
|
||||
for (int j = 0; j < width; j++) {
|
||||
lhs_.applyRef(i, j) = rhs_.apply(i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __NVCC__
/**
 * \brief 1-D CUDA kernel: copies rhs elements into lhs by flat index;
 *        each thread handles one index below `border`.
 */
template <typename LeftType, typename RightType>
__global__ void TensorElementWiseOp(LeftType lhs,
                                    RightType rhs,
                                    const int border) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < border) {
    lhs.applyRef(idx) = rhs.apply(idx);
  }
}

/**
 * \brief 2-D CUDA kernel for non-contiguous data: threads step over rows
 *        and columns with grid-stride loops, copying rhs into lhs at
 *        each (i, j).
 */
template <typename LeftType, typename RightType>
__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) {
  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
  for (int i = rowIdx; i < lhs.getHeight(); i += gridDim.y * blockDim.y) {
    for (int j = colIdx; j < lhs.getWidth(); j += gridDim.x * blockDim.x) {
      lhs.applyRef(i, j) = rhs.apply(i, j);
    }
  }
}
|
||||
|
||||
/**
 * \brief The tensor gpu evaluate api.
 *
 * Wraps both sides in TensorApply evaluators, validates shape and device
 * agreement, then launches the 1-D kernel (contiguous) or the 2-D
 * grid-stride kernel (strided) and synchronizes.
 */
template <class T, typename LeftType, typename RightType>
inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {
  TensorApply<LeftType, T> lhs_(lhs);
  TensorApply<const RightType, T> rhs_(rhs);
  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());

  int dimM = lhs_.getHeight();
  int dimN = lhs_.getWidth();

  if (lhs_.isContiguous() && rhs_.isContiguous()) {
    // 1-D launch: one thread per element, at most 1024 threads per block.
    int size = dimM * dimN;
    int blockSize = size <= 1024 ? size : 1024;
    int gridSize = (size + 1024 - 1) / 1024;
    TensorElementWiseOp<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
        lhs_, rhs_, size);
  } else {
    // 2-D launch: blockSizeX * blockSizeY never exceeds 1024
    // (floor(32/Y)*32*Y <= 32*32), grid capped at 32x32 blocks; the
    // kernel's grid-stride loops cover any remainder.
    int blockSizeY = std::min(32, dimM);
    int blockSizeX = (32 / blockSizeY) * 32;
    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
    dim3 threads(blockSizeX, blockSizeY);
    dim3 grid(gridSizeX, gridSizeY);
    TensorElementWiseOp<<<grid, threads, 0, STREAM_DEFAULT>>>(lhs_, rhs_);
  }

  CHECK_SYNC("TensorGpuApply failed");
}
|
||||
#else
|
||||
/**
 * \brief Host-only no-op stub compiled when CUDA is unavailable.
 *
 * rhs is taken by const reference so the signature accepts the same
 * (possibly const or temporary) expression arguments as the __NVCC__
 * overload above, which already uses `const RightType&`.
 */
template <class T, typename LeftType, typename RightType>
inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {}
|
||||
#endif
|
||||
|
||||
} // namespace paddle
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,122 @@
|
||||
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "paddle/utils/Logging.h"
|
||||
#include "BaseMatrix.h"
|
||||
|
||||
namespace paddle {
|
||||
|
||||
/**
 * \brief Sparse Momentum optimizer.
 */
extern void sparseMomentumApply(BaseMatrix& value,
                                BaseMatrix& grad,
                                BaseMatrix& momU,
                                BaseMatrix& momV,
                                real alpha,
                                real beta,
                                real gamma,
                                real tau,
                                real learningRate);

/**
 * \brief AdaDelta optimizer.
 */
extern void adadeltaApply(BaseMatrix& value,
                          BaseMatrix& grad,
                          BaseMatrix& sum,
                          BaseMatrix& sum1,
                          BaseMatrix& mom,
                          BaseMatrix& lr,
                          real rou,
                          real epsilon,
                          real learningRate,
                          real momentum,
                          real decayRate);

/**
 * \brief AdaGrad optimizer.
 */
extern void adagradApply(BaseMatrix& value,
                         BaseMatrix& grad,
                         BaseMatrix& sum,
                         BaseMatrix& sum1,
                         BaseMatrix& mom,
                         BaseMatrix& lr,
                         real epsilon,
                         real learningRate,
                         real momentum,
                         real decayRate);

/**
 * \brief RMSProp optimizer.
 *
 * \param firstTime  pass true on the first update so the squared-gradient
 *                   accumulator starts from the current gradient.
 */
extern void rmspropApply(BaseMatrix& value,
                         BaseMatrix& grad,
                         BaseMatrix& g,
                         BaseMatrix& f,
                         BaseMatrix& mom,
                         BaseMatrix& lr,
                         real accumulatedRou,
                         real rou,
                         real epsilon,
                         real learningRate,
                         real momentum,
                         real decayRate,
                         bool firstTime);

/**
 * \brief Decayed AdaGrad optimizer.
 *
 * \param firstTime  pass true on the first update so the squared-gradient
 *                   accumulator starts from the current gradient.
 */
extern void decayedAdagradApply(BaseMatrix& value,
                                BaseMatrix& grad,
                                BaseMatrix& mom,
                                BaseMatrix& accum,
                                BaseMatrix& lr,
                                real accumulatedRou,
                                real rou,
                                real epsilon,
                                real learningRate,
                                real momentum,
                                real decayRate,
                                bool firstTime);

/**
 * \brief Adam optimizer.
 *
 * beta1_power / beta2_power are beta1^t and beta2^t for the current
 * step t (used for bias correction).
 */
extern void adamApply(BaseMatrix& value,
                      BaseMatrix& grad,
                      BaseMatrix& mom,
                      BaseMatrix& v,
                      real beta1,
                      real beta2,
                      real beta1_power,
                      real beta2_power,
                      real epsilon,
                      real learningRate);

/**
 * \brief AdaMax optimizer.
 */
extern void adamaxApply(BaseMatrix& value,
                        BaseMatrix& grad,
                        BaseMatrix& mom,  // first moment
                        BaseMatrix& u,    // weighted infinity norm
                        real beta1,
                        real beta2,
                        int64_t step,
                        real alpha);
|
||||
} // namespace paddle
|
||||
@ -0,0 +1,201 @@
|
||||
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "paddle/utils/GlobalConstants.h"
|
||||
#include "paddle/math/Vector.h"
|
||||
|
||||
using namespace paddle; // NOLINT
|
||||
|
||||
/**
 * \brief Sparse momentum update over the parameter vector set.
 *
 * Accumulates two momentum buffers (UT, VT) from the gradient and folds
 * them into the parameter value with coefficients derived from alpha,
 * beta and tau.
 * NOTE(review): assumes Vector::add(x, a) computes this += a*x and
 * add(x, a, y, b) computes this += a*x + b*y — confirm against the
 * Vector API.
 */
void SparseMomentumParameterOptimizer(const VectorPtr vecs[],
                                      real alpha,
                                      real beta,
                                      real gamma,
                                      real tau,
                                      real learningRate) {
  // u_t accumulates -alpha*gamma*lr * g
  vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
                                   -alpha * gamma * learningRate);
  // v_t accumulates tau*alpha*gamma*lr * g
  vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
                                   tau * alpha * gamma * learningRate);
  // value update combines both accumulators.
  vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
                             tau / beta + 1.0 / alpha,
                             *vecs[PARAMETER_MOMENTUM_VT],
                             1.0 / beta);
}
|
||||
|
||||
/**
 * \brief AdaGrad update: accumulate squared gradients and scale the step
 *        by 1/sqrt(accumulated squared gradients + epsilon).
 */
void AdagradParameterOptimizer(const VectorPtr vecs[],
                               real epsilon,
                               real learningRate,
                               real momentum,
                               real decayRate) {
  // Accumulate the current squared gradient into SQURESUM1.
  vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT],
                                                1.0f);
  // lr = 1 / sqrt(SQURESUM + SQURESUM1 + epsilon), element-wise.
  // NOTE(review): assumes the two-vector add overload writes x + y into
  // the learning-rate buffer — confirm against the Vector API.
  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM],
                                     *vecs[PARAMETER_GRADIENT_SQURESUM1]);
  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);

  // Apply the per-element learning rate with momentum and weight decay.
  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
                                   *vecs[PARAMETER_MOMENTUM],
                                   *vecs[PARAMETER_LEARNING_RATE],
                                   learningRate,
                                   momentum,
                                   decayRate);
}
|
||||
|
||||
/**
 * \brief AdaDelta update: decayed accumulators of squared gradients and
 *        squared updates yield a per-element learning rate
 *        sqrt((E[dx^2]+eps)/(E[g^2]+eps)).
 */
void AdaDeltaParameterOptimizer(const VectorPtr vecs[],
                                real rou,
                                real epsilon,
                                real learningRate,
                                real momentum,
                                real decayRate) {
  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
      *vecs[PARAMETER_GRADIENT], rou, 1.0f - rou);

  // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) )
  vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1],
                                        *vecs[PARAMETER_GRADIENT_SQURESUM],
                                        epsilon,
                                        epsilon);
  vecs[PARAMETER_LEARNING_RATE]->sqrt2();

  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
  vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul(
      *vecs[PARAMETER_GRADIENT],
      *vecs[PARAMETER_LEARNING_RATE],
      rou,
      1.0f - rou);

  // Apply the per-element learning rate with momentum and weight decay.
  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
                                   *vecs[PARAMETER_MOMENTUM],
                                   *vecs[PARAMETER_LEARNING_RATE],
                                   learningRate,
                                   momentum,
                                   decayRate);
}
|
||||
|
||||
/**
 * \brief RMSProp update with a centered variance estimate:
 *        lr = 1/sqrt(E[g^2] - (E[g])^2 + epsilon).
 */
void RMSPropParameterOptimizer(const VectorPtr vecs[],
                               real accumulatedRou,
                               real rou,
                               real epsilon,
                               real learningRate,
                               real momentum,
                               real decayRate,
                               bool firstTime) {
  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
  // For the first time update, make the sum be the current square
  // so that the initial estimation of E(g_t^2) will not be too small.
  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);

  // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g
  vecs[PARAMETER_GRADIENT_SQURESUM1]->add(
      *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou);

  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon )
  // Basiclly if the sign of the gradient changes more often,
  // the learning rate will be decreased.
  vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]);
  vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1],
                                           -1.0f);
  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);

  // Apply the per-element learning rate with momentum and weight decay.
  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
                                   *vecs[PARAMETER_MOMENTUM],
                                   *vecs[PARAMETER_LEARNING_RATE],
                                   learningRate,
                                   momentum,
                                   decayRate);
}
|
||||
|
||||
/**
 * \brief Decayed AdaGrad update: like AdaGrad, but the squared-gradient
 *        accumulator decays with rate rou instead of growing forever.
 */
void DecayedAdagradParameterOptimizer(const VectorPtr vecs[],
                                      real accumulatedRou,
                                      real rou,
                                      real epsilon,
                                      real learningRate,
                                      real momentum,
                                      real decayRate,
                                      bool firstTime) {
  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
  // For the first time update, make the sum be the current square
  // so that the initial estimation of E(g_t^2) will not be too small.
  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);

  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
  // Basiclly if the bigger the magnitude gradient is,
  // the smaller the learning rate will be.
  vecs[PARAMETER_LEARNING_RATE]->assign(epsilon);
  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]);
  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);

  // Apply the per-element learning rate with momentum and weight decay.
  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
                                   *vecs[PARAMETER_MOMENTUM],
                                   *vecs[PARAMETER_LEARNING_RATE],
                                   learningRate,
                                   momentum,
                                   decayRate);
}
|
||||
|
||||
/**
 * \brief Adam update (Kingma & Ba) with bias correction via the
 *        precomputed beta1_power = beta1^t and beta2_power = beta2^t.
 *
 * NOTE: the gradient vector is reused as scratch space (square2, sqrt2
 * and dotDiv write into it), so its contents are destroyed.
 */
void AdamParameterOptimizer(const VectorPtr vecs[],
                            real beta1,
                            real beta2,
                            real beta1_power,
                            real beta2_power,
                            real epsilon,
                            real learningRate) {
  Vector* m = vecs[PARAMETER_MOMENTUM].get();
  Vector* g = vecs[PARAMETER_GRADIENT].get();
  Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get();
  Vector* theta = vecs[PARAMETER_VALUE].get();

  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
  m->add(*g, beta1, 1 - beta1);

  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_t^2   (g squared in place)
  g->square2();
  v->add(*g, beta2, 1 - beta2);

  // tmp = m_t / ( \sqrt{v_t} + \epsilon )
  // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp
  g->sqrt2(*v);
  g->dotDiv(*m, *g, 0., epsilon);
  real alpha =
      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
  theta->add(*theta, 1.0, *g, -alpha);
}
|
||||
|
||||
/**
 * \brief AdaMax update (the infinity-norm variant of Adam).
 *
 * NOTE: the gradient vector is reused as scratch space (abs2 and dotDiv
 * write into it), so its contents are destroyed.
 */
void AdamaxParameterOptimizer(
    const VectorPtr vecs[], real beta1, real beta2, int64_t step, real alpha) {
  Vector* m = vecs[PARAMETER_MOMENTUM].get();
  Vector* g = vecs[PARAMETER_GRADIENT].get();
  Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
  Vector* theta = vecs[PARAMETER_VALUE].get();

  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
  m->add(*g, beta1, 1 - beta1);

  // u_t = max(\beta_2*u_{t-1}, abs(g_t))   (g replaced by |g| in place)
  u->mulScalar(beta2);
  g->abs2();
  u->max2(*u, *g);

  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
  g->dotDiv(*m, *u);
  real learningRate = alpha / (1 - std::pow(beta1, step));
  theta->add(*theta, 1.0, *g, -learningRate);
}
|
||||
@ -0,0 +1,46 @@
|
||||
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
// Performance Check
#ifdef PADDLE_DISABLE_TIMER

// Timer disabled: run the expression once with no instrumentation.
#define EXPRESSION_PERFORMANCE(expression) expression;

#else

#include "paddle/utils/Stat.h"
using namespace paddle;  // NOLINT

// Runs `expression` once to warm up, then 20 more times each under a
// REGISTER_TIMER scope, and logs the accumulated timing statistics.
// The stringified expression is truncated to fit the 30-byte label
// buffer: if strncpy did not null-terminate (source >= 30 chars),
// positions 27-28 become ".." and 29 the terminator.
// NOTE(review): relies on strncpy/<cstring> being available via a
// transitive include — confirm.
// (Comments cannot go inside the macro body: a // comment would swallow
// the trailing line-continuation backslash.)
#define EXPRESSION_PERFORMANCE(expression) \
  do { \
    char expr[30]; \
    strncpy(expr, #expression, 30); \
    if (expr[29] != '\0') { \
      expr[27] = '.'; \
      expr[28] = '.'; \
      expr[29] = '\0'; \
    } \
    expression; \
    for (int i = 0; i < 20; i++) { \
      REGISTER_TIMER(expr); \
      expression; \
    } \
    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') \
              << *globalStat.getStat(expr); \
    globalStat.reset(); \
  } while (0)

#endif
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue