@@ -37,25 +37,27 @@ NVPROF_CONFIG = [

 @signature_safe_contextmanager
 def cuda_profiler(output_file, output_mode=None, config=None):
-    """The CUDA profiler.
+    """
+    The CUDA profiler.
+
     This function is used to profile a CUDA program via the CUDA runtime application
     programming interface. The profiling result will be written into
-    `output_file` with Key-Value pair format or Comma separated values format.
-    The user can set the output mode by `output_mode` argument and set the
-    counters/options for profiling by `config` argument. The default config
-    is ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d',
-    'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'].
-    Then users can use NVIDIA Visual Profiler
-    (https://developer.nvidia.com/nvidia-visual-profiler) tools to load this
-    this output file to visualize results.
+    `output_file`. Users can set the output mode by the `output_mode` argument
+    and set the NVIDIA profiling config by the `config` argument.
+
+    After getting the profiling result file, users can use
+    `NVIDIA Visual Profiler <https://developer.nvidia.com/nvidia-visual-profiler>`_
+    to load this output file to visualize results.

     Args:
-        output_file (string) : The output file name, the result will be
+        output_file (str) : The output file name, the result will be
             written into this file.
-        output_mode (string) : The output mode has Key-Value pair format and
-            Comma separated values format. It should be 'kvp' or 'csv'.
-        config (list of string) : The profiler options and counters can refer
-            to "Compute Command Line Profiler User Guide".
+        output_mode (str, optional) : The output mode has Key-Value pair format ('kvp')
+            and Comma separated values format ('csv', default).
+        config (list<str>, optional) : NVIDIA profiling config. Default config is
+            ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d', 'threadblocksize',
+            'streamid', 'enableonstart 0', 'conckerneltrace']. For more details, please
+            refer to `Compute Command Line Profiler User Guide <https://developer.download.nvidia.cn/compute/DevZone/docs/html/C/doc/Compute_Command_Line_Profiler_User_Guide.pdf>`_ .

     Raises:
         ValueError: If `output_mode` is not in ['kvp', 'csv'].
@@ -70,7 +72,7 @@ def cuda_profiler(output_file, output_mode=None, config=None):
             epoc = 8
             dshape = [4, 3, 28, 28]
-            data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+            data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32')
             conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])

             place = fluid.CUDAPlace(0)
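For context, the docstring example that this hunk touches assembles roughly as follows. This is a sketch assuming the PaddlePaddle 1.x `fluid` API and a CUDA-capable device; the output file name `cuda_profiler.txt` is illustrative only:

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid
    import paddle.fluid.profiler as profiler

    epoc = 8
    dshape = [4, 3, 28, 28]
    data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32')
    conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # Profile the CUDA runtime calls made during these iterations and write
    # the result in CSV format; the file can then be loaded in NVIDIA Visual Profiler.
    with profiler.cuda_profiler('cuda_profiler.txt', 'csv') as nvprof:
        for i in range(epoc):
            input = np.random.random(dshape).astype('float32')
            exe.run(fluid.default_main_program(), feed={'data': input})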
@@ -127,13 +129,14 @@ def reset_profiler():
 def start_profiler(state):
     """
     Enable the profiler. Users can use `fluid.profiler.start_profiler` and
-    `fluid.profiler.stop_profiler` to insert the code, except the usage of
-    `fluid.profiler.profiler` interface.
+    `fluid.profiler.stop_profiler` to profile, which is equivalent to using
+    the `fluid.profiler.profiler` interface.

     Args:
-        state (string) : The profiling state, which should be 'CPU', 'GPU'
-            or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling
-            GPU as well. 'All' also generates timeline.
+        state (str) : The profiling state, which should be one of 'CPU', 'GPU'
+            or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling
+            both CPU and GPU; 'All' means profiling both CPU and GPU, and
+            generates a timeline as well.

     Raises:
         ValueError: If `state` is not in ['CPU', 'GPU', 'All'].
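The start/stop pair is meant to bracket only the region of interest. A minimal sketch, assuming the `fluid` 1.x profiler module; `run_one_batch` is a placeholder for the user's own training step:

.. code-block:: python

    import paddle.fluid.profiler as profiler

    profiler.start_profiler('GPU')          # 'GPU' records both CPU and GPU kernels
    for iter in range(10):
        if iter == 2:
            profiler.reset_profiler()       # drop warm-up iterations from the report
        run_one_batch()                     # placeholder for the user's training step
    profiler.stop_profiler('total', '/tmp/profile')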
@@ -168,21 +171,21 @@ def start_profiler(state):
 def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
     """
     Stop the profiler. Users can use `fluid.profiler.start_profiler` and
-    `fluid.profiler.stop_profiler` to insert the code, except the usage of
-    `fluid.profiler.profiler` interface.
+    `fluid.profiler.stop_profiler` to profile, which is equivalent to using
+    the `fluid.profiler.profiler` interface.

     Args:
-        sorted_key (string) : If None, the profiling results will be printed
-            in the order of first end time of events. Otherwise, the profiling
-            results will be sorted by the this flag. This flag should be one
-            of 'calls', 'total', 'max', 'min' or 'ave'.
+        sorted_key (str, optional) : The order of profiling results, which
+            should be one of None, 'calls', 'total', 'max', 'min' or 'ave'.
+            Default is None, which means the profiling results will be printed
+            in the order of first end time of events.
+            `calls` means sorting by the number of calls.
+            `total` means sorting by the total execution time.
+            `max` means sorting by the maximum execution time.
+            `min` means sorting by the minimum execution time.
+            `ave` means sorting by the average execution time.
-        profile_path (string) : If state == 'All', it will write a profile
-            proto output file.
+        profile_path (str, optional) : If state == 'All', it will generate a timeline
+            and write it into `profile_path`. The default profile_path is `/tmp/profile`.

     Raises:
         ValueError: If `sorted_key` is not in
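When the profiling state was started as 'All', `stop_profiler` also writes the timeline proto to `profile_path`. A short sketch of choosing the report order, again assuming the `fluid` 1.x API and with the model iterations elided:

.. code-block:: python

    import paddle.fluid.profiler as profiler

    profiler.start_profiler('All')            # 'All' additionally collects timeline data
    # ... run some iterations of the model here ...
    # Print the report sorted by average op time, and write the timeline proto
    # to /tmp/profile for later visualization.
    profiler.stop_profiler(sorted_key='ave', profile_path='/tmp/profile')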
@@ -223,34 +226,26 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
 @signature_safe_contextmanager
 def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
-    """The profiler interface.
-    Different from cuda_profiler, this profiler can be used to profile both CPU
-    and GPU program. By default, it records the CPU and GPU operator kernels,
-    if you want to profile other program, you can refer the profiling tutorial
-    to add more records in C++ code.
-
-    If the state == 'All', a profile proto file will be written to
-    `profile_path`. This file records timeline information during the execution.
-    Then users can visualize this file to see the timeline, please refer
-    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md
+    """
+    The profiler interface. Different from `fluid.profiler.cuda_profiler`,
+    this profiler can be used to profile both CPU and GPU programs.

     Args:
-        state (string) : The profiling state, which should be 'CPU' or 'GPU',
-            telling the profiler to use CPU timer or GPU timer for profiling.
-            Although users may have already specified the execution place
-            (CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler
-            would not inherit this place.
-        sorted_key (string) : If None, the profiling results will be printed
-            in the order of first end time of events. Otherwise, the profiling
-            results will be sorted by the this flag. This flag should be one
-            of 'calls', 'total', 'max', 'min' or 'ave'.
+        state (str) : The profiling state, which should be one of 'CPU', 'GPU'
+            or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling
+            both CPU and GPU; 'All' means profiling both CPU and GPU, and
+            generates a timeline as well.
+        sorted_key (str, optional) : The order of profiling results, which
+            should be one of None, 'calls', 'total', 'max', 'min' or 'ave'.
+            Default is None, which means the profiling results will be printed
+            in the order of first end time of events.
+            `calls` means sorting by the number of calls.
+            `total` means sorting by the total execution time.
+            `max` means sorting by the maximum execution time.
+            `min` means sorting by the minimum execution time.
+            `ave` means sorting by the average execution time.
-        profile_path (string) : If state == 'All', it will write a profile
-            proto output file.
+        profile_path (str, optional) : If state == 'All', it will generate a timeline
+            and write it into `profile_path`. The default profile_path is `/tmp/profile`.

     Raises:
         ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is
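Since `profiler` is a context manager, the usual pattern is to wrap only the region to be timed in a `with` block; the report is printed when the block exits. A sketch on CPU, assuming the `fluid` 1.x API, along the lines of the docstring example touched by the next two hunks:

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid
    import paddle.fluid.profiler as profiler

    epoc = 8
    dshape = [4, 3, 28, 28]
    data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32')
    conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # Profile CPU ops only; the printed report is sorted by total op time.
    with profiler.profiler('CPU', sorted_key='total') as prof:
        for i in range(epoc):
            input = np.random.random(dshape).astype('float32')
            exe.run(fluid.default_main_program(), feed={'data': input})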
@@ -266,7 +261,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
             epoc = 8
             dshape = [4, 3, 28, 28]
-            data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+            data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32')
             conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])

             place = fluid.CPUPlace()
@@ -277,6 +272,44 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
             for i in range(epoc):
                 input = np.random.random(dshape).astype('float32')
                 exe.run(fluid.default_main_program(), feed={'data': input})
+
+    Examples Results:
+
+        .. code-block:: text
+
+            #### Examples Results ####
+            #### 1) sorted_key = 'total', 'calls', 'max', 'min', 'ave' ####
+            # The only difference in the 5 sorted_key results is the following sentence:
+            # "Sorted by number of xxx in descending order in the same thread."
+            # The reason is that in this example, the above 5 columns are already sorted.
+            -------------------------> Profiling Report <-------------------------
+
+            Place: CPU
+            Time unit: ms
+            Sorted by total time in descending order in the same thread
+            #Sorted by number of calls in descending order in the same thread
+            #Sorted by number of max in descending order in the same thread
+            #Sorted by number of min in descending order in the same thread
+            #Sorted by number of avg in descending order in the same thread
+
+            Event                       Calls    Total      Min.       Max.       Ave.        Ratio.
+            thread0::conv2d             8        129.406    0.304303   127.076    16.1758     0.983319
+            thread0::elementwise_add    8        2.11865    0.193486   0.525592   0.264832    0.016099
+            thread0::feed               8        0.076649   0.006834   0.024616   0.00958112  0.000582432
+
+            #### 2) sorted_key = None ####
+            # Since the profiling results are printed in the order of first end time of Ops,
+            # the printed order is feed->conv2d->elementwise_add
+            -------------------------> Profiling Report <-------------------------
+
+            Place: CPU
+            Time unit: ms
+            Sorted by event first end time in descending order in the same thread
+
+            Event                       Calls    Total      Min.       Max.       Ave.        Ratio.
+            thread0::feed               8        0.077419   0.006608   0.023349   0.00967738  0.00775934
+            thread0::conv2d             8        7.93456    0.291385   5.63342    0.99182     0.795243
+            thread0::elementwise_add    8        1.96555    0.191884   0.518004   0.245693    0.196998
     """
     start_profiler(state)
     yield
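As a quick cross-check on how the report columns relate: for `thread0::conv2d` in the first table, Ave. ≈ Total / Calls = 129.406 / 8 ≈ 16.18 ms, and Ratio. ≈ 129.406 / (129.406 + 2.11865 + 0.076649) ≈ 0.983, matching the printed 16.1758 and 0.983319.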