commit
82774dbbd3
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,211 @@
|
||||
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace paddle {
|
||||
|
||||
/**
|
||||
* \brief The tensor evaluator classes.
|
||||
*/
|
||||
template <typename Derived, class T>
|
||||
class TensorApply {
|
||||
public:
|
||||
explicit INLINE TensorApply(const Derived& p)
|
||||
: data_(p.data_),
|
||||
stride_(p.stride_),
|
||||
height_(p.height_),
|
||||
width_(p.width_),
|
||||
useGpu_(p.useGpu_) {}
|
||||
|
||||
INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
|
||||
INLINE T apply(int index) const { return data_[index]; }
|
||||
INLINE T& applyRef(int i, int j) { return data_[i * stride_ + j]; }
|
||||
INLINE T& applyRef(int index) { return data_[index]; }
|
||||
|
||||
INLINE size_t getWidth() const { return width_; }
|
||||
INLINE size_t getHeight() const { return height_; }
|
||||
INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
|
||||
INLINE bool useGpu() const { return useGpu_; }
|
||||
|
||||
T* data_;
|
||||
size_t stride_;
|
||||
size_t height_;
|
||||
size_t width_;
|
||||
bool useGpu_;
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief The tensor evaluator classes.
|
||||
* evaluator for rvalues
|
||||
*/
|
||||
template <typename Derived, class T>
|
||||
class TensorApply<const Derived, T> {
|
||||
public:
|
||||
explicit INLINE TensorApply(const Derived& p)
|
||||
: data_(p.data_),
|
||||
stride_(p.stride_),
|
||||
height_(p.height_),
|
||||
width_(p.width_),
|
||||
useGpu_(p.useGpu_) {}
|
||||
|
||||
INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
|
||||
INLINE T apply(int index) const { return data_[index]; }
|
||||
|
||||
INLINE size_t getWidth() const { return width_; }
|
||||
INLINE size_t getHeight() const { return height_; }
|
||||
INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
|
||||
INLINE bool useGpu() const { return useGpu_; }
|
||||
|
||||
const T* data_;
|
||||
size_t stride_;
|
||||
size_t height_;
|
||||
size_t width_;
|
||||
bool useGpu_;
|
||||
};
|
||||
|
||||
template <typename Derived, class T>
|
||||
class TensorApply<const TensorExpression<Derived, T>, T> {
|
||||
public:
|
||||
explicit TensorApply(const TensorExpression<Derived, T>& expr)
|
||||
: expr_(expr.derived()) {}
|
||||
|
||||
INLINE T apply(int i, int j) const { return expr_.apply(i, j); }
|
||||
INLINE T apply(int index) const { return expr_.apply(index); }
|
||||
|
||||
INLINE size_t getWidth() const { return expr_.getWidth(); }
|
||||
INLINE size_t getHeight() const { return expr_.getHeight(); }
|
||||
INLINE bool isContiguous() const { return expr_.isContiguous(); }
|
||||
INLINE bool useGpu() const { return expr_.useGpu(); }
|
||||
|
||||
TensorApply<const Derived, T> expr_;
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief The unary expression evaluator classes.
|
||||
*/
|
||||
template <class OP, typename ArgType, class T>
|
||||
class TensorApply<const TensorUnaryOp<OP, ArgType, T>, T> {
|
||||
public:
|
||||
explicit INLINE TensorApply(const TensorUnaryOp<OP, ArgType, T>& expr)
|
||||
: op_(expr.op_), expr_(expr.expr_) {}
|
||||
|
||||
INLINE T apply(int i, int j) const { return op_(expr_.apply(i, j)); }
|
||||
INLINE T apply(int index) const { return op_(expr_.apply(index)); }
|
||||
|
||||
INLINE size_t getWidth() const { return expr_.getWidth(); }
|
||||
INLINE size_t getHeight() const { return expr_.getHeight(); }
|
||||
INLINE bool isContiguous() const { return expr_.isContiguous(); }
|
||||
INLINE bool useGpu() const { return expr_.useGpu(); }
|
||||
|
||||
const OP op_;
|
||||
TensorApply<ArgType, T> expr_;
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief The binary expression evaluator classes.
|
||||
*/
|
||||
template <class OP, typename LhsType, typename RhsType, class T>
|
||||
class TensorApply<const TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
|
||||
public:
|
||||
explicit INLINE TensorApply(
|
||||
const TensorBinaryOp<OP, LhsType, RhsType, T>& expr)
|
||||
: op_(expr.op_), lhs_(expr.lhs_), rhs_(expr.rhs_) {
|
||||
#ifndef __CUDA_ARCH__
|
||||
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
|
||||
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
|
||||
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
|
||||
#endif
|
||||
}
|
||||
|
||||
INLINE T apply(int i, int j) const {
|
||||
return op_(lhs_.apply(i, j), rhs_.apply(i, j));
|
||||
}
|
||||
INLINE T apply(int index) const {
|
||||
return op_(lhs_.apply(index), rhs_.apply(index));
|
||||
}
|
||||
|
||||
INLINE size_t getWidth() const { return lhs_.getWidth(); }
|
||||
INLINE size_t getHeight() const { return rhs_.getHeight(); }
|
||||
INLINE bool isContiguous() const {
|
||||
return lhs_.isContiguous() && rhs_.isContiguous();
|
||||
}
|
||||
INLINE bool useGpu() const { return lhs_.useGpu(); }
|
||||
|
||||
const OP op_;
|
||||
TensorApply<LhsType, T> lhs_;
|
||||
TensorApply<RhsType, T> rhs_;
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief The ternary expression evaluator classes.
|
||||
*/
|
||||
template <typename ArgType1, typename ArgType2, typename ArgType3, class T>
|
||||
class TensorApply<const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>, T> {
|
||||
public:
|
||||
explicit INLINE TensorApply(
|
||||
const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>& expr)
|
||||
: expr1_(expr.expr1_), expr2_(expr.expr2_), expr3_(expr.expr3_) {
|
||||
#ifndef __CUDA_ARCH__
|
||||
CHECK_EQ(expr1_.getWidth(), expr2_.getWidth());
|
||||
CHECK_EQ(expr1_.getWidth(), expr3_.getWidth());
|
||||
CHECK_EQ(expr1_.getHeight(), expr2_.getHeight());
|
||||
CHECK_EQ(expr1_.getHeight(), expr3_.getHeight());
|
||||
CHECK_EQ(expr1_.useGpu(), expr2_.useGpu());
|
||||
CHECK_EQ(expr1_.useGpu(), expr3_.useGpu());
|
||||
#endif
|
||||
}
|
||||
|
||||
INLINE T apply(int i, int j) const {
|
||||
return expr1_.apply(i, j) ? expr2_.apply(i, j) : expr3_.apply(i, j);
|
||||
}
|
||||
INLINE T apply(int index) const {
|
||||
return expr1_.apply(index) ? expr2_.apply(index) : expr3_.apply(index);
|
||||
}
|
||||
|
||||
INLINE size_t getWidth() const { return expr1_.getWidth(); }
|
||||
INLINE size_t getHeight() const { return expr1_.getHeight(); }
|
||||
INLINE bool isContiguous() const {
|
||||
return expr1_.isContiguous() && expr2_.isContiguous() &&
|
||||
expr3_.isContiguous();
|
||||
}
|
||||
INLINE bool useGpu() const { return expr1_.useGpu(); }
|
||||
|
||||
TensorApply<ArgType1, T> expr1_;
|
||||
TensorApply<ArgType2, T> expr2_;
|
||||
TensorApply<ArgType3, T> expr3_;
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief The const expression evaluator classes.
|
||||
*/
|
||||
template <class OP, typename ArgType, class T>
|
||||
class TensorApply<const TensorConstant<OP, ArgType, T>, T> {
|
||||
public:
|
||||
explicit INLINE TensorApply(const TensorConstant<OP, ArgType, T>& expr)
|
||||
: op_(expr.op_), expr_(expr.expr_) {}
|
||||
|
||||
INLINE T apply(int i, int j) const { return op_(i, j); }
|
||||
INLINE T apply(int index) const { return op_(index); }
|
||||
|
||||
INLINE size_t getWidth() const { return expr_.getWidth(); }
|
||||
INLINE size_t getHeight() const { return expr_.getHeight(); }
|
||||
INLINE bool isContiguous() const { return true; }
|
||||
INLINE bool useGpu() const { return expr_.useGpu(); }
|
||||
|
||||
const OP op_;
|
||||
TensorApply<ArgType, T> expr_;
|
||||
};
|
||||
|
||||
} // namespace paddle
|
@ -0,0 +1,158 @@
|
||||
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include "paddle/utils/Logging.h"
|
||||
|
||||
namespace paddle {
|
||||
|
||||
/**
|
||||
* \brief Tensor Assign Expression(return by lazyAssign,
|
||||
* and evaluated by AssignEvaluate)
|
||||
*/
|
||||
template <typename LhsType, typename RhsType, class T>
|
||||
class TensorAssignOp {
|
||||
public:
|
||||
explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
|
||||
: lhs_(lhs), rhs_(rhs) {
|
||||
#ifndef __CUDA_ARCH__
|
||||
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
|
||||
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
|
||||
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
|
||||
#endif
|
||||
}
|
||||
|
||||
INLINE void apply(const int i, const int j) {
|
||||
lhs_.applyRef(i, j) = rhs_.apply(i, j);
|
||||
}
|
||||
INLINE void apply(const int index) {
|
||||
lhs_.applyRef(index) = rhs_.apply(index);
|
||||
}
|
||||
|
||||
INLINE size_t getWidth() const { return lhs_.getWidth(); }
|
||||
INLINE size_t getHeight() const { return rhs_.getHeight(); }
|
||||
INLINE bool isContiguous() const {
|
||||
return lhs_.isContiguous() && rhs_.isContiguous();
|
||||
}
|
||||
INLINE bool useGpu() const { return lhs_.useGpu(); }
|
||||
|
||||
private:
|
||||
TensorApply<LhsType, T> lhs_;
|
||||
TensorApply<const RhsType, T> rhs_;
|
||||
};
|
||||
|
||||
/**
 * \brief Execute one or more assignment expressions on the CPU.
 *
 * For every element position, `assign` is applied first and then each
 * expression in `args`, in argument order.
 *
 * \param height        number of rows to visit
 * \param width         number of columns to visit
 * \param isContiguous  when true, elements are visited through the flat
 *                      index 0 .. height * width - 1; otherwise through
 *                      (row, column) pairs in row-major order
 * \param assign        first expression; must provide apply(int) and
 *                      apply(int, int)
 * \param args          further expressions with the same interface
 *                      (may be empty)
 */
template <typename Assign, typename... AssignOp>
void AssignCpuEvaluate(int height,
                       int width,
                       bool isContiguous,
                       Assign&& assign,
                       AssignOp&&... args) {
  if (isContiguous) {
    const int size = height * width;
    for (int index = 0; index < size; index++) {
      assign.apply(index);
      // Braced-list expansion runs the pack left to right. The leading 0
      // keeps the array non-empty when `args` is an empty pack, since a
      // zero-length array is not valid standard C++.
      const int expand[] = {0, ((void)args.apply(index), 0)...};
      (void)expand;
    }
  } else {
    for (int i = 0; i < height; i++) {
      for (int j = 0; j < width; j++) {
        assign.apply(i, j);
        const int expand[] = {0, ((void)args.apply(i, j), 0)...};
        (void)expand;
      }
    }
  }
}
|
||||
|
||||
#ifdef __NVCC__
|
||||
template <typename Assign, typename... AssignOp>
|
||||
__global__ void AssignGpuEvaluate1(const int border,
|
||||
Assign assign,
|
||||
AssignOp... args) {
|
||||
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (idx < border) {
|
||||
assign.apply(idx);
|
||||
__attribute__((unused)) int dummy[] = {(((args)).apply(idx), 0)...};
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Assign, typename... AssignOp>
|
||||
__global__ void AssignGpuEvaluate2(const int height,
|
||||
const int width,
|
||||
Assign assign,
|
||||
AssignOp... args) {
|
||||
const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) {
|
||||
for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) {
|
||||
assign.apply(i, j);
|
||||
__attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
 * \brief Evaluate one or more TensorAssignOp objects.
 *
 * All expressions must agree with the first one on GPU flag, height and
 * width (enforced with CHECK_EQ). The flat-index path is used only when
 * every expression reports itself as contiguous. Tensors with zero rows or
 * zero columns are accepted and result in no work.
 *
 * \param assign  first assignment expression
 * \param args    further assignment expressions (may be empty)
 *
 * \note At least one assignment expression is required
 * \note The GPU branch is compiled only under __NVCC__; in other builds a
 *       GPU expression falls through without being evaluated.
 */
template <typename Assign, typename... AssignOp>
void AssignEvaluate(Assign&& assign, AssignOp&&... args) {
  const bool useGpu_ = assign.useGpu();
  bool isContiguous_ = assign.isContiguous();
  const size_t height = assign.getHeight();
  const size_t width = assign.getWidth();

  // Slot 0 of each array holds the value of `assign`, so the arrays are never
  // zero-length even when `args` is an empty pack. Slots 1..packSize hold the
  // values of `args`.
  const int packSize = sizeof...(args);
  const bool packUseGpu[] = {useGpu_, args.useGpu()...};
  const bool packIsContiguous[] = {isContiguous_, args.isContiguous()...};
  const size_t packHeight[] = {height, args.getHeight()...};
  const size_t packWidth[] = {width, args.getWidth()...};

  for (int i = 1; i <= packSize; i++) {
    CHECK_EQ(useGpu_, packUseGpu[i]);
    CHECK_EQ(height, packHeight[i]);
    CHECK_EQ(width, packWidth[i]);
    isContiguous_ = isContiguous_ && packIsContiguous[i];
  }

  // Nothing to evaluate. Returning here also keeps the GPU path away from a
  // zero-sized launch configuration and from dividing by a zero block height.
  if (height == 0 || width == 0) {
    return;
  }

  if (useGpu_) {
#ifdef __NVCC__
    if (isContiguous_) {
      // One thread per element, at most 1024 threads per block.
      const int size = static_cast<int>(height * width);
      const int blockSize = size <= 1024 ? size : 1024;
      const int gridSize = (size + 1024 - 1) / 1024;
      AssignGpuEvaluate1<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
          size, assign, args...);
    } else {
      // Up to 32 rows per block; the grid is capped at 32 x 32 blocks and
      // the kernel loops over whatever the grid does not cover directly.
      const int blockSizeY = std::min(32, static_cast<int>(height));
      const int blockSizeX = (32 / blockSizeY) * 32;
      const int gridSizeX = std::min(
          32, static_cast<int>(width + blockSizeX - 1) / blockSizeX);
      const int gridSizeY = std::min(
          32, static_cast<int>(height + blockSizeY - 1) / blockSizeY);
      dim3 threads(blockSizeX, blockSizeY);
      dim3 grid(gridSizeX, gridSizeY);
      AssignGpuEvaluate2<<<grid, threads, 0, STREAM_DEFAULT>>>(
          height, width, assign, args...);
    }

    CHECK_SYNC("AssignEvaluate failed");
#endif
  } else {
    AssignCpuEvaluate(height, width, isContiguous_, assign, args...);
  }
}
|
||||
|
||||
} // namespace paddle
|
@ -0,0 +1,109 @@
|
||||
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include "paddle/utils/Logging.h"
|
||||
#include "hl_base.h"
|
||||
|
||||
namespace paddle {
|
||||
|
||||
/**
|
||||
* \brief The tensor cpu evaluate api.
|
||||
*/
|
||||
template <class T, typename LeftType, typename RightType>
|
||||
inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) {
|
||||
TensorApply<LeftType, T> lhs_(lhs);
|
||||
TensorApply<const RightType, T> rhs_(rhs);
|
||||
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
|
||||
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
|
||||
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
|
||||
|
||||
int height = lhs_.getHeight();
|
||||
int width = lhs_.getWidth();
|
||||
if (lhs_.isContiguous() && rhs_.isContiguous()) {
|
||||
int size = height * width;
|
||||
for (int index = 0; index < size; index++) {
|
||||
lhs_.applyRef(index) = rhs_.apply(index);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < height; i++) {
|
||||
for (int j = 0; j < width; j++) {
|
||||
lhs_.applyRef(i, j) = rhs_.apply(i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __NVCC__
|
||||
template <typename LeftType, typename RightType>
|
||||
__global__ void TensorElementWiseOp(LeftType lhs,
|
||||
RightType rhs,
|
||||
const int border) {
|
||||
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (idx < border) {
|
||||
lhs.applyRef(idx) = rhs.apply(idx);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename LeftType, typename RightType>
|
||||
__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) {
|
||||
const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
for (int i = rowIdx; i < lhs.getHeight(); i += gridDim.y * blockDim.y) {
|
||||
for (int j = colIdx; j < lhs.getWidth(); j += gridDim.x * blockDim.x) {
|
||||
lhs.applyRef(i, j) = rhs.apply(i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief The tensor gpu evaluate api.
|
||||
*/
|
||||
template <class T, typename LeftType, typename RightType>
|
||||
inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {
|
||||
TensorApply<LeftType, T> lhs_(lhs);
|
||||
TensorApply<const RightType, T> rhs_(rhs);
|
||||
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
|
||||
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
|
||||
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
|
||||
|
||||
int dimM = lhs_.getHeight();
|
||||
int dimN = lhs_.getWidth();
|
||||
|
||||
if (lhs_.isContiguous() && rhs_.isContiguous()) {
|
||||
int size = dimM * dimN;
|
||||
int blockSize = size <= 1024 ? size : 1024;
|
||||
int gridSize = (size + 1024 - 1) / 1024;
|
||||
TensorElementWiseOp<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
|
||||
lhs_, rhs_, size);
|
||||
} else {
|
||||
int blockSizeY = std::min(32, dimM);
|
||||
int blockSizeX = (32 / blockSizeY) * 32;
|
||||
int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
|
||||
int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
|
||||
dim3 threads(blockSizeX, blockSizeY);
|
||||
dim3 grid(gridSizeX, gridSizeY);
|
||||
TensorElementWiseOp<<<grid, threads, 0, STREAM_DEFAULT>>>(lhs_, rhs_);
|
||||
}
|
||||
|
||||
CHECK_SYNC("TensorGpuApply failed");
|
||||
}
|
||||
#else
|
||||
/**
 * \brief Stand-in for the GPU evaluate API when not compiled by NVCC.
 *
 * Does nothing. The right-hand side is taken by const reference, matching
 * the NVCC version, so that calls passing a temporary or const expression
 * compile the same way in both build configurations.
 */
template <class T, typename LeftType, typename RightType>
inline void TensorGpuApply(LeftType& /*lhs*/, const RightType& /*rhs*/) {}
|
||||
#endif
|
||||
|
||||
} // namespace paddle
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,122 @@
|
||||
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "paddle/utils/Logging.h"
|
||||
#include "BaseMatrix.h"
|
||||
|
||||
namespace paddle {
|
||||
|
||||
/**
|
||||
* \brief Sparse Momentum optimizer.
|
||||
*/
|
||||
extern void sparseMomentumApply(BaseMatrix& value,
|
||||
BaseMatrix& grad,
|
||||
BaseMatrix& momU,
|
||||
BaseMatrix& momV,
|
||||
real alpha,
|
||||
real beta,
|
||||
real gamma,
|
||||
real tau,
|
||||
real learningRate);
|
||||
|
||||
/**
|
||||
* \brief AdaDelta optimizer.
|
||||
*/
|
||||
extern void adadeltaApply(BaseMatrix& value,
|
||||
BaseMatrix& grad,
|
||||
BaseMatrix& sum,
|
||||
BaseMatrix& sum1,
|
||||
BaseMatrix& mom,
|
||||
BaseMatrix& lr,
|
||||
real rou,
|
||||
real epsilon,
|
||||
real learningRate,
|
||||
real momentum,
|
||||
real decayRate);
|
||||
|
||||
/**
|
||||
* \brief AdaGrad optimizer.
|
||||
*/
|
||||
extern void adagradApply(BaseMatrix& value,
|
||||
BaseMatrix& grad,
|
||||
BaseMatrix& sum,
|
||||
BaseMatrix& sum1,
|
||||
BaseMatrix& mom,
|
||||
BaseMatrix& lr,
|
||||
real epsilon,
|
||||
real learningRate,
|
||||
real momentum,
|
||||
real decayRate);
|
||||
|
||||
/**
|
||||
* \brief RMSProp optimizer.
|
||||
*/
|
||||
extern void rmspropApply(BaseMatrix& value,
|
||||
BaseMatrix& grad,
|
||||
BaseMatrix& g,
|
||||
BaseMatrix& f,
|
||||
BaseMatrix& mom,
|
||||
BaseMatrix& lr,
|
||||
real accumulatedRou,
|
||||
real rou,
|
||||
real epsilon,
|
||||
real learningRate,
|
||||
real momentum,
|
||||
real decayRate,
|
||||
bool firstTime);
|
||||
|
||||
/**
|
||||
* \brief Decayed AdaGrad optimizer.
|
||||
*/
|
||||
extern void decayedAdagradApply(BaseMatrix& value,
|
||||
BaseMatrix& grad,
|
||||
BaseMatrix& mom,
|
||||
BaseMatrix& accum,
|
||||
BaseMatrix& lr,
|
||||
real accumulatedRou,
|
||||
real rou,
|
||||
real epsilon,
|
||||
real learningRate,
|
||||
real momentum,
|
||||
real decayRate,
|
||||
bool firstTime);
|
||||
|
||||
/**
|
||||
* \brief Adam optimizer.
|
||||
*/
|
||||
extern void adamApply(BaseMatrix& value,
|
||||
BaseMatrix& grad,
|
||||
BaseMatrix& mom,
|
||||
BaseMatrix& v,
|
||||
real beta1,
|
||||
real beta2,
|
||||
real beta1_power,
|
||||
real beta2_power,
|
||||
real epsilon,
|
||||
real learningRate);
|
||||
|
||||
/**
|
||||
* \brief AdaMax optimizer.
|
||||
*/
|
||||
extern void adamaxApply(BaseMatrix& value,
|
||||
BaseMatrix& grad,
|
||||
BaseMatrix& mom, // firse moment
|
||||
BaseMatrix& u, // weighted infinity norm
|
||||
real beta1,
|
||||
real beta2,
|
||||
int64_t step,
|
||||
real alpha);
|
||||
} // namespace paddle
|
@ -0,0 +1,201 @@
|
||||
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "paddle/utils/GlobalConstants.h"
|
||||
#include "paddle/math/Vector.h"
|
||||
|
||||
using namespace paddle; // NOLINT
|
||||
|
||||
void SparseMomentumParameterOptimizer(const VectorPtr vecs[],
|
||||
real alpha,
|
||||
real beta,
|
||||
real gamma,
|
||||
real tau,
|
||||
real learningRate) {
|
||||
vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
|
||||
-alpha * gamma * learningRate);
|
||||
vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
|
||||
tau * alpha * gamma * learningRate);
|
||||
vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
|
||||
tau / beta + 1.0 / alpha,
|
||||
*vecs[PARAMETER_MOMENTUM_VT],
|
||||
1.0 / beta);
|
||||
}
|
||||
|
||||
void AdagradParameterOptimizer(const VectorPtr vecs[],
|
||||
real epsilon,
|
||||
real learningRate,
|
||||
real momentum,
|
||||
real decayRate) {
|
||||
vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT],
|
||||
1.0f);
|
||||
vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM],
|
||||
*vecs[PARAMETER_GRADIENT_SQURESUM1]);
|
||||
vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
|
||||
vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
|
||||
|
||||
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
|
||||
*vecs[PARAMETER_MOMENTUM],
|
||||
*vecs[PARAMETER_LEARNING_RATE],
|
||||
learningRate,
|
||||
momentum,
|
||||
decayRate);
|
||||
}
|
||||
|
||||
void AdaDeltaParameterOptimizer(const VectorPtr vecs[],
|
||||
real rou,
|
||||
real epsilon,
|
||||
real learningRate,
|
||||
real momentum,
|
||||
real decayRate) {
|
||||
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
|
||||
vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
|
||||
*vecs[PARAMETER_GRADIENT], rou, 1.0f - rou);
|
||||
|
||||
// learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) )
|
||||
vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1],
|
||||
*vecs[PARAMETER_GRADIENT_SQURESUM],
|
||||
epsilon,
|
||||
epsilon);
|
||||
vecs[PARAMETER_LEARNING_RATE]->sqrt2();
|
||||
|
||||
// E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
|
||||
vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul(
|
||||
*vecs[PARAMETER_GRADIENT],
|
||||
*vecs[PARAMETER_LEARNING_RATE],
|
||||
rou,
|
||||
1.0f - rou);
|
||||
|
||||
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
|
||||
*vecs[PARAMETER_MOMENTUM],
|
||||
*vecs[PARAMETER_LEARNING_RATE],
|
||||
learningRate,
|
||||
momentum,
|
||||
decayRate);
|
||||
}
|
||||
|
||||
void RMSPropParameterOptimizer(const VectorPtr vecs[],
|
||||
real accumulatedRou,
|
||||
real rou,
|
||||
real epsilon,
|
||||
real learningRate,
|
||||
real momentum,
|
||||
real decayRate,
|
||||
bool firstTime) {
|
||||
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
|
||||
// For the first time update, make the sum be the current square
|
||||
// so that the initial estimation of E(g_t^2) will not be too small.
|
||||
vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
|
||||
*vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
|
||||
|
||||
// E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g
|
||||
vecs[PARAMETER_GRADIENT_SQURESUM1]->add(
|
||||
*vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou);
|
||||
|
||||
// learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon )
|
||||
// Basiclly if the sign of the gradient changes more often,
|
||||
// the learning rate will be decreased.
|
||||
vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]);
|
||||
vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1],
|
||||
-1.0f);
|
||||
vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
|
||||
vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
|
||||
|
||||
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
|
||||
*vecs[PARAMETER_MOMENTUM],
|
||||
*vecs[PARAMETER_LEARNING_RATE],
|
||||
learningRate,
|
||||
momentum,
|
||||
decayRate);
|
||||
}
|
||||
|
||||
void DecayedAdagradParameterOptimizer(const VectorPtr vecs[],
|
||||
real accumulatedRou,
|
||||
real rou,
|
||||
real epsilon,
|
||||
real learningRate,
|
||||
real momentum,
|
||||
real decayRate,
|
||||
bool firstTime) {
|
||||
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
|
||||
// For the first time update, make the sum be the current square
|
||||
// so that the initial estimation of E(g_t^2) will not be too small.
|
||||
vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
|
||||
*vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
|
||||
|
||||
// learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
|
||||
// Basiclly if the bigger the magnitude gradient is,
|
||||
// the smaller the learning rate will be.
|
||||
vecs[PARAMETER_LEARNING_RATE]->assign(epsilon);
|
||||
vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]);
|
||||
vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
|
||||
|
||||
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
|
||||
*vecs[PARAMETER_MOMENTUM],
|
||||
*vecs[PARAMETER_LEARNING_RATE],
|
||||
learningRate,
|
||||
momentum,
|
||||
decayRate);
|
||||
}
|
||||
|
||||
void AdamParameterOptimizer(const VectorPtr vecs[],
|
||||
real beta1,
|
||||
real beta2,
|
||||
real beta1_power,
|
||||
real beta2_power,
|
||||
real epsilon,
|
||||
real learningRate) {
|
||||
Vector* m = vecs[PARAMETER_MOMENTUM].get();
|
||||
Vector* g = vecs[PARAMETER_GRADIENT].get();
|
||||
Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get();
|
||||
Vector* theta = vecs[PARAMETER_VALUE].get();
|
||||
|
||||
// m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
|
||||
m->add(*g, beta1, 1 - beta1);
|
||||
|
||||
// v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
|
||||
g->square2();
|
||||
v->add(*g, beta2, 1 - beta2);
|
||||
|
||||
// tmp = m_t / ( \sqrt{v_t} + \epsilon )
|
||||
// \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp
|
||||
g->sqrt2(*v);
|
||||
g->dotDiv(*m, *g, 0., epsilon);
|
||||
real alpha =
|
||||
learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
|
||||
theta->add(*theta, 1.0, *g, -alpha);
|
||||
}
|
||||
|
||||
void AdamaxParameterOptimizer(
|
||||
const VectorPtr vecs[], real beta1, real beta2, int64_t step, real alpha) {
|
||||
Vector* m = vecs[PARAMETER_MOMENTUM].get();
|
||||
Vector* g = vecs[PARAMETER_GRADIENT].get();
|
||||
Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
|
||||
Vector* theta = vecs[PARAMETER_VALUE].get();
|
||||
|
||||
// m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
|
||||
m->add(*g, beta1, 1 - beta1);
|
||||
|
||||
// u_t = max(\beta_2*u_{t-1}, abs(g_t))
|
||||
u->mulScalar(beta2);
|
||||
g->abs2();
|
||||
u->max2(*u, *g);
|
||||
|
||||
// \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
|
||||
g->dotDiv(*m, *u);
|
||||
real learningRate = alpha / (1 - std::pow(beta1, step));
|
||||
theta->add(*theta, 1.0, *g, -learningRate);
|
||||
}
|
@ -0,0 +1,46 @@
|
||||
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
// Performance Check
//
// EXPRESSION_PERFORMANCE(expression) is a single statement in both
// configurations and must be followed by a semicolon at the call site.
#ifdef PADDLE_DISABLE_TIMER

// Timers disabled: evaluate the expression exactly once. The do/while(0)
// wrapper matches the timed variant, so the macro is safe inside an
// unbraced if/else and scopes the expression identically in both builds.
#define EXPRESSION_PERFORMANCE(expression) \
  do {                                     \
    expression;                            \
  } while (0)

#else

#include <cstring>
#include <iomanip>

#include "paddle/utils/Stat.h"
using namespace paddle;  // NOLINT

// Timers enabled: evaluate the expression once untimed, then 20 more times
// under REGISTER_TIMER, log the collected statistic and reset globalStat.
// The timer label is the expression text cut to 29 characters; longer text
// ends in "..". The macro's own locals carry an exprPerf prefix so that they
// are unlikely to capture identifiers used inside `expression`.
#define EXPRESSION_PERFORMANCE(expression)                             \
  do {                                                                 \
    char exprPerfLabel_[30];                                           \
    std::strncpy(exprPerfLabel_, #expression, 30);                     \
    if (exprPerfLabel_[29] != '\0') {                                  \
      exprPerfLabel_[27] = '.';                                        \
      exprPerfLabel_[28] = '.';                                        \
      exprPerfLabel_[29] = '\0';                                       \
    }                                                                  \
    expression;                                                        \
    for (int exprPerfRun_ = 0; exprPerfRun_ < 20; exprPerfRun_++) {    \
      REGISTER_TIMER(exprPerfLabel_);                                  \
      expression;                                                      \
    }                                                                  \
    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') \
              << *globalStat.getStat(exprPerfLabel_);                  \
    globalStat.reset();                                                \
  } while (0)

#endif
|
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue