/**
 * test_TrainingAlgorithm.cpp
 *
 * Author: hedaoyuan (hedaoyuan@baidu.com)
 * Created on: 2016-06-29
 *
 * Copyright (c) Baidu.com, Inc. All Rights Reserved
 */

#include <gtest/gtest.h>
#include "paddle/utils/Util.h"
#include "paddle/math/TrainingAlgorithmOp.h"
#include "OriginalOptimizerApi.h"
#include "TensorCheck.h"

using namespace paddle;  // NOLINT

#ifndef PADDLE_TYPE_DOUBLE
P_DEFINE_double(max_diff, 1e-5, "max diff allowed");
#else
P_DEFINE_double(max_diff, 1e-13, "max diff allowed");
#endif
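
// RAII helper: temporarily overrides FLAGS_max_diff (the tolerance used when
// comparing the two implementations) and restores the previous value on
// scope exit.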
class SetMaxDiff {
public:
  explicit SetMaxDiff(double max_diff) {
    max_diff_ = FLAGS_max_diff;
    FLAGS_max_diff = max_diff;
  }
  ~SetMaxDiff() { FLAGS_max_diff = max_diff_; }

private:
  double max_diff_;
};
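
// Copy a possibly GPU-resident vector to a CPU vector so results can be
// compared on the host; a vector already on the CPU is shared, not copied.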
#define COPY_VECTOR_TO_CPU(cpuVec, vector)                \
  do {                                                    \
    if (vector->useGpu()) {                               \
      cpuVec = Vector::create(vector->getSize(), false);  \
      cpuVec->copyFrom(*vector);                          \
    } else {                                              \
      cpuVec = vector;                                    \
    }                                                     \
  } while (0)
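
// VectorPtr overload: copies both operands to the CPU and defers to the
// element-wise VectorCheckErr comparison (presumably declared in
// TensorCheck.h).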
int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
  VectorPtr tmp1;
  VectorPtr tmp2;
  COPY_VECTOR_TO_CPU(tmp1, vector1);
  COPY_VECTOR_TO_CPU(tmp2, vector2);
  return VectorCheckErr(*tmp1, *tmp2);
}
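
// A per-size test function, and the driver that sweeps it over a range of
// vector sizes, once with useGpu=false and once with useGpu=true.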
typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;

void testCase(testMatrixFunc matrixFunc) {
  for (auto useGpu : {false, true}) {
    for (auto size : {1, 32, 64, 128, 512, 1024, 4096, 32768, 65536, 131072,
                      262144, 524288, 1048576, 2097152}) {
      LOG(INFO) << " size=" << size << " useGpu=" << useGpu;
      matrixFunc(size, useGpu);
    }
  }
}
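
// Initialize a pair of identical vectors: vec1[type] is filled with random
// values and vec2[type] copies them, so both implementations start from the
// same state.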
#define INIT_VECTOR(vec1, vec2, type, size, useGpu) \
  vec1[type] = Vector::create(size, useGpu);        \
  vec2[type] = Vector::create(size, useGpu);        \
  vec1[type]->rand();                               \
  vec2[type]->copyFrom(*vec1[type]);
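
// Each testXxx below follows the same pattern: run the reference optimizer
// (OriginalOptimizerApi.h) on bufs1 and the BaseMatrix-expression version
// (TrainingAlgorithmOp.h) on bufs2, starting from identical buffers and the
// same random hyper-parameters, then check that every buffer the update
// touches agrees within FLAGS_max_diff. Adagrad itself, in outline,
// accumulates squared gradients and scales each coordinate's step by the
// inverse square root of that accumulation.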
void testAdagrad(size_t size, bool useGpu) {
  VectorPtr bufs1[NUM_PARAMETER_TYPES];
  VectorPtr bufs2[NUM_PARAMETER_TYPES];
  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);

  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT

  EXPRESSION_PERFORMANCE(AdagradParameterOptimizer(
      bufs1, epsilon, learningRate, momentum, decayRate));

  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
  BaseMatrix& accum_buffer = *bufs2[PARAMETER_GRADIENT_SQURESUM];
  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];

  EXPRESSION_PERFORMANCE(adagradApply(value, grad, mom, accum_buffer, accum,
                                      lr, epsilon, learningRate, momentum,
                                      decayRate));

  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
                  bufs2[PARAMETER_LEARNING_RATE]);
}

TEST(Training, Adagrad) {
  testCase(testAdagrad);
}
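
// AdaDelta in outline: keeps decayed averages of squared gradients (accum)
// and squared updates (accum_update), with rou as the averaging decay and
// epsilon the conditioning term; the variant tested here also takes
// learningRate, momentum and decayRate.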
void testAdaDelta(size_t size, bool useGpu) {
  VectorPtr bufs1[NUM_PARAMETER_TYPES];
  VectorPtr bufs2[NUM_PARAMETER_TYPES];
  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);

  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT

  EXPRESSION_PERFORMANCE(AdaDeltaParameterOptimizer(
      bufs1, rou, epsilon, learningRate, momentum, decayRate));

  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
  BaseMatrix& accum_update = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];

  EXPRESSION_PERFORMANCE(adadeltaApply(value, grad, mom, accum, accum_update,
                                       lr, rou, epsilon, learningRate,
                                       momentum, decayRate));

  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
                  bufs2[PARAMETER_LEARNING_RATE]);
}

TEST(Training, AdaDelta) {
  testCase(testAdaDelta);
}
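
// RMSProp in outline: keeps decayed averages of the squared gradient (sum)
// and of the gradient (sum1) and normalizes the step by the variance
// estimate sum - sum1^2; the +1.0 added to sum below keeps that quantity
// positive, as the in-body comment notes. isFirstTime selects the
// accumulator-initialization path.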
template<bool isFirstTime>
void testRMSProp(size_t size, bool useGpu) {
  VectorPtr bufs1[NUM_PARAMETER_TYPES];
  VectorPtr bufs2[NUM_PARAMETER_TYPES];
  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);

  /* make sure 'g - f.square()' greater than 0 */
  bufs1[PARAMETER_GRADIENT_SQURESUM]->add(1.0);
  bufs2[PARAMETER_GRADIENT_SQURESUM]->copyFrom(
      *bufs1[PARAMETER_GRADIENT_SQURESUM]);

  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
  real accumulatedRou = rou;

  EXPRESSION_PERFORMANCE(RMSPropParameterOptimizer(
      bufs1, accumulatedRou, rou, epsilon, learningRate, momentum, decayRate,
      isFirstTime));

  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
  BaseMatrix& sum1 = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];

  EXPRESSION_PERFORMANCE(rmspropApply(value, grad, mom, sum, sum1, lr,
                                      accumulatedRou, rou, epsilon,
                                      learningRate, momentum, decayRate,
                                      isFirstTime));

  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
                  bufs2[PARAMETER_LEARNING_RATE]);
}

TEST(Training, RMSProp) {
  testCase(testRMSProp<true>);
  testCase(testRMSProp<false>);
}
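
// Decayed Adagrad in outline: Adagrad with a decayed rather than unbounded
// sum of squared gradients; when isFirstTime is set the accumulator is
// zeroed before the update.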
template<bool isFirstTime>
void testDecayedAdagrad(size_t size, bool useGpu) {
  VectorPtr bufs1[NUM_PARAMETER_TYPES];
  VectorPtr bufs2[NUM_PARAMETER_TYPES];
  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);

  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
  real accumulatedRou = rou;

  if (isFirstTime) {
    bufs1[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
    bufs2[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
  }

  EXPRESSION_PERFORMANCE(DecayedAdagradParameterOptimizer(
      bufs1, accumulatedRou, rou, epsilon, learningRate, momentum, decayRate,
      isFirstTime));

  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];

  EXPRESSION_PERFORMANCE(decayedAdagradApply(value, grad, mom, sum, lr,
                                             accumulatedRou, rou, epsilon,
                                             learningRate, momentum, decayRate,
                                             isFirstTime));

  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
                  bufs2[PARAMETER_LEARNING_RATE]);
}

TEST(Training, DecayedAdagrad) {
  testCase(testDecayedAdagrad<false>);
  testCase(testDecayedAdagrad<true>);
}
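
// Adam in outline: exponential moving estimates of the gradient's first
// moment (mom, decay beta1) and second moment (v, decay beta2); beta1_power
// and beta2_power are the accumulated decay factors used for bias
// correction.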
void testAdam(size_t size, bool useGpu) {
  VectorPtr bufs1[NUM_PARAMETER_TYPES];
  VectorPtr bufs2[NUM_PARAMETER_TYPES];
  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_SECOND_MOMENTUM, size, useGpu);

  real beta1 = (real)rand() / (real)RAND_MAX;         // NOLINT
  real beta2 = (real)rand() / (real)RAND_MAX;         // NOLINT
  real beta1_power = (real)rand() / (real)RAND_MAX;   // NOLINT
  real beta2_power = (real)rand() / (real)RAND_MAX;   // NOLINT
  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT

  EXPRESSION_PERFORMANCE(AdamParameterOptimizer(
      bufs1, beta1, beta2, beta1_power, beta2_power, epsilon, learningRate));

  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
  BaseMatrix& v = *bufs2[PARAMETER_SECOND_MOMENTUM];

  EXPRESSION_PERFORMANCE(adamApply(value, grad, mom, v, beta1, beta2,
                                   beta1_power, beta2_power, epsilon,
                                   learningRate));

  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_SECOND_MOMENTUM],
                  bufs2[PARAMETER_SECOND_MOMENTUM]);
}

TEST(Training, Adam) {
  testCase(testAdam);
}
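
// Adamax in outline: the infinity-norm variant of Adam, tracking a weighted
// running max of gradient magnitudes (u) instead of a second moment; step is
// the update count. Single precision needs a looser tolerance here, hence
// the SetMaxDiff guard in the TEST below.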
void testAdamax(size_t size, bool useGpu) {
  VectorPtr bufs1[NUM_PARAMETER_TYPES];
  VectorPtr bufs2[NUM_PARAMETER_TYPES];
  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_WEIGHTED_INFINITY_NORM, size, useGpu);

  real beta1 = (real)rand() / (real)RAND_MAX;  // NOLINT
  real beta2 = (real)rand() / (real)RAND_MAX;  // NOLINT
  real alpha = (real)rand() / (real)RAND_MAX;  // NOLINT
  int64_t step = 2;

  EXPRESSION_PERFORMANCE(AdamaxParameterOptimizer(
      bufs1, beta1, beta2, step, alpha));

  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
  BaseMatrix& u = *bufs2[PARAMETER_WEIGHTED_INFINITY_NORM];

  EXPRESSION_PERFORMANCE(adamaxApply(value, grad, mom, u, beta1, beta2, step,
                                     alpha));

  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_WEIGHTED_INFINITY_NORM],
                  bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]);
}

TEST(Training, Adamax) {
#ifndef PADDLE_TYPE_DOUBLE
  SetMaxDiff diff(1e-4);
#endif
  testCase(testAdamax);
}
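
// Sparse momentum keeps two auxiliary momentum buffers (ut, vt) with
// hyper-parameters alpha, beta, gamma and tau; see
// SparseMomentumParameterOptimizer in OriginalOptimizerApi.h for the exact
// update.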
void testSparseMomentum(size_t size, bool useGpu) {
  VectorPtr bufs1[NUM_PARAMETER_TYPES];
  VectorPtr bufs2[NUM_PARAMETER_TYPES];
  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_UT, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_VT, size, useGpu);

  real alpha = (real)rand() / (real)RAND_MAX;         // NOLINT
  real beta = (real)rand() / (real)RAND_MAX;          // NOLINT
  real gamma = (real)rand() / (real)RAND_MAX;         // NOLINT
  real tau = (real)rand() / (real)RAND_MAX;           // NOLINT
  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT

  EXPRESSION_PERFORMANCE(SparseMomentumParameterOptimizer(
      bufs1, alpha, beta, gamma, tau, learningRate));

  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
  BaseMatrix& momU = *bufs2[PARAMETER_MOMENTUM_UT];
  BaseMatrix& momV = *bufs2[PARAMETER_MOMENTUM_VT];

  EXPRESSION_PERFORMANCE(sparseMomentumApply(value, grad, momU, momV, alpha,
                                             beta, gamma, tau, learningRate));

  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_UT],
                  bufs2[PARAMETER_MOMENTUM_UT]);
  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_VT],
                  bufs2[PARAMETER_MOMENTUM_VT]);
}

TEST(Training, SparseMomentum) {
  testCase(testSparseMomentum);
}
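
// Initialize gtest and Paddle's runtime, bring up the GPU layer
// (hl_start/hl_init), then run all tests.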
int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);
  hl_start();
  hl_init(FLAGS_gpu_id);
  return RUN_ALL_TESTS();
}