Extend current profiler for timeline and more features.

8 years ago · b9ec24c6e9
parent 2c89d97538
commit b9ec24c6e9
24 changed files with 699 additions and 38 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -146,6 +146,7 @@ include(external/cares)
 include(external/grpc)

 include(cudnn)              # set cudnn libraries, must before configure
+include(cupti)
 include(configure)          # add paddle env configuration
 include(generic)            # simplify cmake module
 include(package)            # set paddle packages
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -59,6 +59,7 @@ endif(NOT WITH_GOLANG)

 if(NOT WITH_GPU)
    add_definitions(-DHPPL_STUB_FUNC)
+    add_definitions("-DCUPTI_LIB_PATH=\"\"")

    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
@ -73,7 +74,14 @@ else()
    if(NOT CUDNN_FOUND)
        message(FATAL_ERROR "Paddle needs cudnn to compile")
    endif()
-
+    if(CUPTI_FOUND)
+        include_directories(${CUPTI_INCLUDE_DIR})
+        add_definitions(-DPADDLE_WITH_CUPTI)
+        add_definitions("-DCUPTI_LIB_PATH=\"${CUPTI_LIBRARY_PATH}\"")
+    else()
+        add_definitions("-DCUPTI_LIB_PATH=\"\"")
+        message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
+    endif()
    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")

    # Include cuda and cudnn
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@ -155,7 +155,8 @@ endif()
 include_directories(${CUDA_INCLUDE_DIRS})
 list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
 if(NOT WITH_DSO)
-    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
+    # TODO(panyx0718): CUPTI only allows DSO?
+    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
 endif(NOT WITH_DSO)

 # setting nvcc arch flags
--- a/cmake/cupti.cmake
+++ b/cmake/cupti.cmake
@ -0,0 +1,41 @@
+# Locate the CUPTI headers and shared library used for GPU profiling.
+# Sets CUPTI_INCLUDE_DIR, CUPTI_LIBRARY, CUPTI_LIBRARY_PATH and CUPTI_FOUND.
+# For builds without WITH_GPU the file returns early and sets none of them.
+if(NOT WITH_GPU)
+    return()
+endif()
+
+
+set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT")
+find_path(CUPTI_INCLUDE_DIR cupti.h
+        PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include
+        $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include
+        ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include
+        NO_DEFAULT_PATH
+        )
+
+# Directory of the CUDA runtime library; used below as one more search path.
+# Quoted so that an empty CUDA_CUDART_LIBRARY does not drop the argument.
+get_filename_component(__libpath_hist "${CUDA_CUDART_LIBRARY}" PATH)
+
+# Architecture component of the multiarch directory (lib/<arch>-linux-gnu).
+# The variable is tested by name: `if(NOT ${CMAKE_SYSTEM_PROCESSOR})` would
+# expand the value first and then treat that value as a variable name.
+set(TARGET_ARCH "x86_64")
+if(CMAKE_SYSTEM_PROCESSOR)
+    set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
+endif()
+
+list(APPEND CUPTI_CHECK_LIBRARY_DIRS
+        ${CUPTI_ROOT}
+        ${CUPTI_ROOT}/lib64
+        ${CUPTI_ROOT}/lib
+        ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu
+        $ENV{CUPTI_ROOT}
+        $ENV{CUPTI_ROOT}/lib64
+        $ENV{CUPTI_ROOT}/lib
+        /usr/lib
+        ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64)
+find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a
+       PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist}
+       NO_DEFAULT_PATH
+       DOC "Path to cuPTI library.")
+
+# Directory containing the library; configure.cmake passes it to the compiler
+# as CUPTI_LIB_PATH when CUPTI_FOUND is ON.
+get_filename_component(CUPTI_LIBRARY_PATH "${CUPTI_LIBRARY}" DIRECTORY)
+if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
+    set(CUPTI_FOUND ON)
+else()
+    set(CUPTI_FOUND OFF)
+endif()
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@ -127,7 +127,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
    auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);

    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    platform::RecordEvent record_event(op->Type(), pool.Get(place_));
+    // TODO(panyx0718): Need a program id to distinguish programs.
+    platform::RecordEvent record_event(op->Type(), pool.Get(place_),
+                                       op_desc->Block()->ID());

    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
    op->Run(*local_scope, place_);
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@ -167,4 +167,6 @@ message BlockDesc {
 // Please refer to
 // https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
 // for more details.
+// TODO(panyx0718): A model can have multiple programs. Need a
+// way to distinguish them. Maybe ID or name?
 message ProgramDesc { repeated BlockDesc blocks = 1; }
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@ -125,6 +125,8 @@ class OpDesc {

  BlockDesc *Block() { return this->block_; }

+  const BlockDesc &BlockRef() const { return *this->block_; }
+
  void SetBlock(BlockDesc *block) { this->block_ = block; }

 private:
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@ -1,3 +1,5 @@
+proto_library(profiler_proto SRCS profiler.proto)
+
 if(WITH_GPU)
  cc_library(enforce SRCS enforce.cc DEPS)
 else()
@ -37,7 +39,8 @@ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
 nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)

-cc_library(profiler SRCS profiler.cc DEPS device_context)
+cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
+cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)

 nv_test(float16_gpu_test SRCS float16_test.cu)
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/platform/dynload/cupti.h"
+#include "paddle/fluid/platform/profiler.pb.h"
+
+namespace paddle {
+namespace platform {
+
+///////////////////////
+// WARN: Under Development. Don't depend on it yet.
+//////////////////////
+
+// DeviceTracer performs the following tasks:
+// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
+// 2. Collect cuda statistics: start/end ts, memory, etc.
+// 3. Generate a protobuf for further analysis.
+class DeviceTracer {  // Abstract interface: every operation is pure virtual.
+ public:
+  struct KernelRecord {  // Fields mirror the AddKernelRecords() parameters.
+    uint64_t start_ns;
+    uint64_t end_ns;
+    uint32_t device_id;
+    uint32_t stream_id;
+    uint32_t correlation_id;
+  };
+
+  virtual ~DeviceTracer() {}
+  // Needs to be called once before use.
+  virtual void Enable() = 0;
+  // Needs to be called once after use.
+  virtual void Disable() = 0;
+
+  // Add an (id, annotation) pair that correlates an internal cuda id with
+  // a high-level annotation string, so that cuda statistics can be
+  // presented with human-readable names.
+  virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
+
+  // Add the stats of one cuda kernel. `correlation_id` will be mapped to an
+  // annotation added earlier, for human readability.
+  virtual void AddKernelRecords(uint64_t start, uint64_t end,
+                                uint32_t device_id, uint32_t stream_id,
+                                uint32_t correlation_id) = 0;
+
+  // Generate a proto::Profile once tracing is done (i.e. after Disable()).
+  virtual proto::Profile GenProfile() = 0;
+
+  virtual bool IsEnabled() = 0;  // NOTE(review): presumably true between Enable() and Disable() - confirm.
+};
+
+// Get a DeviceTracer.
+DeviceTracer* GetDeviceTracer();
+
+// Set a name for the cuda kernel operation being launched by the thread.
+void SetCurAnnotation(const char* anno);
+// Clear the name after the operation is done.
+void ClearCurAnnotation();
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@ -1,4 +1,8 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
-nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc
-        DEPS dynamic_loader)
+
+list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
+if (CUPTI_FOUND)
+    list(APPEND CUDA_SRCS cupti.cc)
+endif(CUPTI_FOUND)
+nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
--- a/paddle/fluid/platform/dynload/cupti.cc
+++ b/paddle/fluid/platform/dynload/cupti.cc
@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUPTI
+
+#include "paddle/fluid/platform/dynload/cupti.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag cupti_dso_flag;
+void *cupti_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CUPTI_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
+
+#endif  // PADDLE_WITH_CUPTI
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_CUPTI
+#include <cuda.h>
+#include <cupti.h>
+#include <dlfcn.h>
+#include <mutex>
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+extern std::once_flag cupti_dso_flag;
+extern void *cupti_dso_handle;
+
+/**
+ * The following macro generates one struct per CUPTI function.
+ * Each struct overloads operator() so that, when PADDLE_USE_DSO is
+ * defined, the CUPTI routine is looked up dynamically and then called.
+ *
+ * Note: dynamically linked libraries are the default.
+ */
+#ifdef PADDLE_USE_DSO
+#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)                    \
+  struct DynLoad__##__name {                                       \
+    template <typename... Args>                                    \
+    inline CUptiResult CUPTIAPI operator()(Args... args) {         \
+      typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...);          \
+      std::call_once(cupti_dso_flag,                               \
+                     paddle::platform::dynload::GetCUPTIDsoHandle, \
+                     &cupti_dso_handle);                           \
+      void *p_##__name = dlsym(cupti_dso_handle, #__name);         \
+      return reinterpret_cast<cuptiFunc>(p_##__name)(args...);     \
+    }                                                              \
+  };                                                               \
+  extern DynLoad__##__name __name
+#else
+#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)            \
+  struct DynLoad__##__name {                               \
+    template <typename... Args>                            \
+    inline CUptiResult CUPTIAPI operator()(Args... args) { \
+      return __name(args...);                              \
+    }                                                      \
+  };                                                       \
+  extern DynLoad__##__name __name
+#endif
+
+#define CUPTI_ROUTINE_EACH(__macro)           \
+  __macro(cuptiActivityEnable);               \
+  __macro(cuptiActivityDisable);              \
+  __macro(cuptiActivityRegisterCallbacks);    \
+  __macro(cuptiActivityGetAttribute);         \
+  __macro(cuptiActivitySetAttribute);         \
+  __macro(cuptiGetTimestamp);                 \
+  __macro(cuptiActivityGetNextRecord);        \
+  __macro(cuptiGetResultString);              \
+  __macro(cuptiActivityGetNumDroppedRecords); \
+  __macro(cuptiActivityFlushAll);             \
+  __macro(cuptiFinalize);                     \
+  __macro(cuptiSubscribe);                    \
+  __macro(cuptiUnsubscribe);                  \
+  __macro(cuptiEnableCallback);
+
+CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
+
+#undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
+
+#endif  // PADDLE_WITH_CUPTI
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@ -40,10 +40,14 @@ DEFINE_string(nccl_dir, "",
              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
              "dlopen will search cuda from LD_LIBRARY_PATH");

+DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
+
 namespace paddle {
 namespace platform {
 namespace dynload {

+static const char* cupti_lib_path = CUPTI_LIB_PATH;
+
 static inline std::string join(const std::string& part1,
                               const std::string& part2) {
  // directory separator
@ -143,6 +147,18 @@ void GetCUDNNDsoHandle(void** dso_handle) {
 #endif
 }

+// Opens the CUPTI shared library and stores its handle in *dso_handle.
+// The search directory is --cupti_dir when that flag is non-empty, otherwise
+// the CUPTI_LIB_PATH value recorded at build time.
+void GetCUPTIDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  const char* lib_name = "libcupti.dylib";
+#else
+  const char* lib_name = "libcupti.so";
+#endif
+  const std::string search_dir =
+      FLAGS_cupti_dir.empty() ? std::string(cupti_lib_path) : FLAGS_cupti_dir;
+  GetDsoHandleFromSearchPath(search_dir, lib_name, dso_handle, false);
+}
+
 void GetCurandDsoHandle(void** dso_handle) {
 #if defined(__APPLE__) || defined(__OSX__)
  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@ -34,6 +34,8 @@ void GetCublasDsoHandle(void** dso_handle);
 */
 void GetCUDNNDsoHandle(void** dso_handle);

+void GetCUPTIDsoHandle(void** dso_handle);
+
 /**
 * @brief    load the DSO of CURAND
 *
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@ -15,7 +15,13 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
 #include <iomanip>
 #include <map>
+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#endif  // PADDLE_WITH_CUDA
 #include "glog/logging.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/platform/device_tracer.h"
+#include "paddle/fluid/string/printf.h"

 namespace paddle {
 namespace platform {
@ -126,15 +132,20 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
  GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
 }

-RecordEvent::RecordEvent(const std::string& name,
-                         const DeviceContext* dev_ctx) {
+RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx,
+                         int32_t block_id) {
  if (g_state == ProfilerState::kDisabled) return;
  dev_ctx_ = dev_ctx;
  name_ = name;
  PushEvent(name_, dev_ctx_);
+
+  full_name_ = string::Sprintf("%s_b%d", name, block_id);
+  // Maybe need the same push/pop behavior.
+  SetCurAnnotation(full_name_.c_str());
 }

 RecordEvent::~RecordEvent() {
+  ClearCurAnnotation();
  if (g_state == ProfilerState::kDisabled) return;
  PopEvent(name_, dev_ctx_);
 }
@ -147,7 +158,14 @@ void EnableProfiler(ProfilerState state) {
                 "The profiling state should be disabled when calling ",
                 "EnableProfiler.");
  g_state = state;
-  g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU";
+  if (g_state == ProfilerState::kCUDA) {
+    g_profiler_place = "CUDA";
+  } else if (g_state == ProfilerState::kCPU) {
+    g_profiler_place = "CPU";
+  } else {
+    g_profiler_place = "All";
+    GetDeviceTracer()->Enable();
+  }
 #ifdef PADDLE_WITH_CUDA
  if (g_state == ProfilerState::kCUDA) {
    // Generate some dummy evenets first to reduce the startup overhead.
@ -190,6 +208,12 @@ void DisableProfiler(EventSortingKey sorted_key) {
  Mark("_stop_profiler_", nullptr);
  g_state = ProfilerState::kDisabled;

+  DeviceTracer* tracer = GetDeviceTracer();
+  if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
+    tracer->Disable();
+    tracer->GenProfile();
+  }
+
  std::vector<std::vector<Event>> all_events = GetAllEvents();
  ParseEvents(all_events, sorted_key);
  ResetProfiler();
@ -254,9 +278,11 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
        }

        if (rit != pushed_events.rend()) {
-          double event_time = (g_profiler_place == "CUDA")
-                                  ? rit->CudaElapsedMs(events[i][j])
-                                  : rit->CpuElapsedMs(events[i][j]);
+          double event_time =
+              (g_profiler_place == "CUDA" || g_profiler_place == "All")
+                  ? rit->CudaElapsedMs(events[i][j])
+                  : rit->CpuElapsedMs(events[i][j]);
+
          std::string event_name =
              "thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
          max_name_width = std::max(max_name_width, event_name.size());
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@ -18,6 +18,7 @@ limitations under the License. */
 #include <mutex>
 #include <vector>
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/profiler.pb.h"

 namespace paddle {
 namespace platform {
@ -93,6 +94,7 @@ enum ProfilerState {
  kDisabled,  // disabled state
  kCPU,       // CPU profiling state
  kCUDA,      // GPU profiling state
+  kAll,       // Profile both CPU and GPU. (Currently experimental).
 };

 void Mark(const std::string& name, const DeviceContext* dev_ctx);
@ -102,7 +104,8 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
 void PopEvent(const std::string& name, const DeviceContext* dev_ctx);

 struct RecordEvent {
-  explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
+  RecordEvent(const std::string& name, const DeviceContext* dev_ctx,
+              int32_t block_id);

  ~RecordEvent();

@ -110,9 +113,12 @@ struct RecordEvent {
  const DeviceContext* dev_ctx_;
  // Event name
  std::string name_;
+  // Need to distinguish name by op type, block_id, program_id and perhaps
+  // different kernel invocations within an op.
+  std::string full_name_;
 };

-// Return the event list of all threads. Asummed the returned value calls
+// Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
 std::vector<std::vector<Event>> GetAllEvents();

--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto2";
+package paddle.platform.proto;
+
+message Event {
+  optional string name = 1;
+  optional uint64 start_ns = 2;
+  optional uint64 end_ns = 3;
+  optional uint32 device_id = 5;
+  optional uint32 stream_id = 6;
+}
+
+message Profile {
+  repeated Event events = 1;
+  optional uint64 start_ns = 2;
+  optional uint64 end_ns = 3;
+}
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@ -95,7 +95,7 @@ TEST(RecordEvent, RecordEvent) {
   */
  for (int i = 1; i < 5; ++i) {
    std::string name = "evs_op_" + std::to_string(i);
-    RecordEvent record_event(name, dev_ctx);
+    RecordEvent record_event(name, dev_ctx, 0);
    int counter = 1;
    while (counter != i * 1000) counter++;
  }
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@ -459,6 +459,7 @@ All parameter, weight, gradient are variables in Paddle.
      .value("kDisabled", platform::ProfilerState::kDisabled)
      .value("kCPU", platform::ProfilerState::kCPU)
      .value("kCUDA", platform::ProfilerState::kCUDA)
+      .value("kAll", platform::ProfilerState::kAll)
      .export_values();

  py::enum_<platform::EventSortingKey>(m, "EventSortingKey", py::arithmetic())
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@ -97,9 +97,14 @@ def profiler(state, sorted_key=None):
            The `ave` means sorting by the average execution time.
    """

-    if state not in ['CPU', 'GPU']:
-        raise ValueError("The state must be 'CPU' or 'GPU'.")
-    prof_state = core.ProfilerState.kCUDA if state == "GPU" else core.ProfilerState.kCPU
+    if state not in ['CPU', 'GPU', "All"]:
+        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
+    if state == "GPU":
+        prof_state = core.ProfilerState.kCUDA
+    elif state == "CPU":
+        prof_state = core.ProfilerState.kCPU
+    else:
+        prof_state = core.ProfilerState.kAll
    core.enable_profiler(prof_state)
    yield

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@ -41,6 +41,7 @@ list(REMOVE_ITEM TEST_OPS test_while_op)
 list(REMOVE_ITEM TEST_OPS test_lod_array_length_op)
 list(REMOVE_ITEM TEST_OPS test_reorder_lod_tensor)
 list(REMOVE_ITEM TEST_OPS test_profiler)
+list(REMOVE_ITEM TEST_OPS test_nvprof)
 list(REMOVE_ITEM TEST_OPS test_normalization_wrapper)
 list(REMOVE_ITEM TEST_OPS test_executor_and_mul)
 list(REMOVE_ITEM TEST_OPS test_assign_value_op)
@ -75,6 +76,7 @@ py_test_modules(test_while_op MODULES test_while_op)
 py_test_modules(test_lod_array_length_op MODULES test_lod_array_length_op)
 py_test_modules(test_reorder_lod_tensor MODULES test_reorder_lod_tensor)
 py_test_modules(test_profiler MODULES test_profiler)
+py_test_modules(test_nvprof MODULES test_nvprof)
 py_test_modules(test_normalization_wrapper MODULES test_normalization_wrapper)
 py_test_modules(test_executor_and_mul MODULES test_executor_and_mul)
 py_test_modules(test_assign_value_op MODULES test_assign_value_op)
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@ -22,27 +22,9 @@ import paddle.fluid.core as core


 class TestProfiler(unittest.TestCase):
-    def test_nvprof(self):
-        if not fluid.core.is_compiled_with_cuda():
-            return
-        epoc = 8
-        dshape = [4, 3, 28, 28]
-        data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
-        conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
-
-        place = fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-
-        output_file = 'cuda_profiler.txt'
-        with profiler.cuda_profiler(output_file, 'csv') as nvprof:
-            for i in range(epoc):
-                input = np.random.random(dshape).astype('float32')
-                exe.run(fluid.default_main_program(), feed={'data': input})
-        os.remove(output_file)
-
    def net_profiler(self, state):
-        if state == 'GPU' and not core.is_compiled_with_cuda():
+        enable_if_gpu = state == 'GPU' or state == "All"
+        if enable_if_gpu and not core.is_compiled_with_cuda():
            return
        startup_program = fluid.Program()
        main_program = fluid.Program()
@ -85,6 +67,9 @@ class TestProfiler(unittest.TestCase):
    def test_cuda_profiler(self):
        self.net_profiler('GPU')

+    def test_all_profiler(self):
+        self.net_profiler('All')
+

 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/fluid/tests/unittests/test_nvprof.py
+++ b/python/paddle/v2/fluid/tests/unittests/test_nvprof.py
@ -0,0 +1,46 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.profiler as profiler
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+
+
+class TestNVProf(unittest.TestCase):
+    def test_nvprof(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+        epoc = 8
+        dshape = [4, 3, 28, 28]
+        data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+        conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+
+        place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        output_file = 'cuda_profiler.txt'
+        with profiler.cuda_profiler(output_file, 'csv') as nvprof:
+            for i in range(epoc):
+                input = np.random.random(dshape).astype('float32')
+                exe.run(fluid.default_main_program(), feed={'data': input})
+        os.remove(output_file)
+
+
+if __name__ == '__main__':
+    unittest.main()