Add recursive mutex and counter for gpu profiler

avx_docs
liaogang 9 years ago
parent 9670b9a1be
commit 8393c19ccf

@ -24,7 +24,7 @@ Why we need profiling?
======================
Since training a deep neural network typically takes a very long time to complete, performance is gradually becoming
the most important thing in deep learning field. The first step to improve performance is to understand what parts
are slow. No point in improving performance of a region which doesn't take much time!
are slow. There is no point in improving performance of a region which doesn't take much time!
How to do profiling?
@ -59,6 +59,7 @@ above profilers.
The above code snippet includes two methods, you can use any of them to profile the regions of interest.
1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both cpu functions and cuda kernels.
2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
program crashes when CPU version of PaddlePaddle invokes them.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 476 KiB

@ -70,10 +70,14 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
input->randomizeUniform();
inputGpu->copyFrom(*input);
target->bilinearForward(*input, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
{
// nvprof: GPU Proflier
REGISTER_GPU_PROFILER("testBilinearFwdBwd");
target->bilinearForward(*input, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
}
// check
targetCheck->copyFrom(*targetGpu);
@ -104,25 +108,29 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
MatrixCheckErr(*inputGrad, *targetCheckGrad);
}
TEST(Profiler, BilinearFwdBwd) {
TEST(Profiler, testBilinearFwdBwd) {
auto numSamples = 10;
auto channels = 16;
auto imgSize = 64;
{
// nvprof: GPU Proflier
REGISTER_GPU_PROFILER("testBilinearFwdBwd",
"numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
REGISTER_GPU_PROFILER("testBilinearFwdBwd");
// Paddle built-in timer
REGISTER_TIMER_INFO("testBilinearFwdBwd",
"numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
testBilinearFwdBwd(numSamples, imgSize, imgSize, channels);
}
globalStat.printStatus("testBilinearFwdBwd");
globalStat.printAllStatus();
}
// Test-binary entry point: initializes gtest and Paddle, then registers a
// top-level GPU profiler scope so that the profilers created inside the
// individual tests nest within it (exercising the recursive-mutex / counter
// behavior this change introduces).
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
// nvprof: GPU Profiler — outermost registration; inner REGISTER_GPU_PROFILER
// uses in the tests should nest inside this one without crashing.
REGISTER_GPU_PROFILER("RecursiveProfilingTest",
"numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
return RUN_ALL_TESTS();
}

@ -203,4 +203,22 @@ StatInfo::~StatInfo() {
}
}
static unsigned g_profileCount = 0;
static std::recursive_mutex g_profileMutex;
// Scoped GPU profiling session.  guard_ is a member lock_guard on a
// file-local recursive mutex, so the lock is held for the object's whole
// lifetime: nested profilers on the same thread re-enter freely while
// profilers on other threads are serialized.
GpuProfiler::GpuProfiler(std::string statName, std::string info)
    : guard_(g_profileMutex) {
  ++g_profileCount;
  // Only the outermost (first) profiler in the nesting actually starts
  // the underlying CUDA profiler.
  if (g_profileCount == 1) {
    LOG(INFO) << "Enable GPU Profiler Stat: ["
              << statName << "] " << info;
    hl_profiler_start();
  }
}
// Counterpart of the constructor: decrement the nesting counter and stop
// the CUDA profiler only when the outermost session ends.  The member
// guard_ is released after this body runs, so the decrement is still
// protected by g_profileMutex.
GpuProfiler::~GpuProfiler() {
  --g_profileCount;
  if (g_profileCount == 0) {
    hl_profiler_end();
  }
}
} // namespace paddle

@ -283,8 +283,10 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1,
class GpuProfiler final {
public:
GpuProfiler() { hl_profiler_start(); }
~GpuProfiler() { hl_profiler_end(); }
GpuProfiler(std::string statName, std::string info);
~GpuProfiler();
private:
std::lock_guard<std::recursive_mutex> guard_;
};
#ifdef PADDLE_DISABLE_PROFILER
@ -293,10 +295,8 @@ public:
#else
#define REGISTER_GPU_PROFILER(statName, ...) \
LOG(INFO) << "Enable GPU Profiler Stat: [" \
<< statName << "] " << #__VA_ARGS__; \
GpuProfiler __gpuProfiler;
#define REGISTER_GPU_PROFILER(statName, ...) \
GpuProfiler __gpuProfiler(statName, #__VA_ARGS__);
#endif // DISABLE_PROFILER

Loading…
Cancel
Save