diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 03323e3da7..26c09eb8eb 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -66,6 +66,7 @@ void FetchOpHandle::RunImpl() {
     if (platform::is_gpu_place(var->place_)) {
 #ifdef PADDLE_WITH_CUDA
       TensorCopy(t, cpu, *dev_ctx_[t.place()], &tensors_[i]);
+      dev_ctx_[t.place()]->Wait();
 #endif
     } else {
       tensors_[i].ShareDataWith(t);
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 07a4b89217..63affb7054 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -33,9 +33,6 @@ std::string OpHandleBase::DebugString() const {
 
 OpHandleBase::~OpHandleBase() {
 #ifdef PADDLE_WITH_CUDA
-  for (auto &ctx : dev_ctx_) {
-    ctx.second->Wait();
-  }
   for (auto &ev : events_) {
     PADDLE_ENFORCE(cudaEventDestroy(ev.second));
   }
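
Note (not part of the patch): the change puts the synchronization right next to the asynchronous device-to-host copy in FetchOpHandle::RunImpl, instead of relying on a blanket Wait() over every device context in the OpHandleBase destructor, so the fetched CPU tensor is only read after the copy has completed. Below is a minimal standalone CUDA sketch of that ordering; the names (stream, d_src, h_dst) are illustrative and none of it is Paddle code.

// Standalone sketch (not Paddle code): an asynchronous device-to-host copy
// must be followed by a wait on the issuing stream before the host buffer
// is read -- the same ordering the patch enforces by calling
// dev_ctx_[t.place()]->Wait() right after TensorCopy.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const size_t n = 1 << 20;
  const size_t bytes = n * sizeof(float);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  float *d_src = nullptr;
  float *h_dst = nullptr;
  cudaMalloc(reinterpret_cast<void **>(&d_src), bytes);
  // Pinned host memory, so the copy below is truly asynchronous.
  cudaMallocHost(reinterpret_cast<void **>(&h_dst), bytes);

  cudaMemsetAsync(d_src, 0, bytes, stream);

  // Enqueue the copy; the call returns before the data reaches h_dst.
  cudaMemcpyAsync(h_dst, d_src, bytes, cudaMemcpyDeviceToHost, stream);

  // Reading h_dst before this point would race with the copy. Waiting on
  // the issuing stream (analogous to the added dev_ctx_[t.place()]->Wait())
  // makes the read safe.
  cudaStreamSynchronize(stream);
  std::printf("first element after sync: %f\n", h_dst[0]);

  cudaFreeHost(h_dst);
  cudaFree(d_src);
  cudaStreamDestroy(stream);
  return 0;
}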