diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 8a8f15a4fa..9b09a03f2b 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -48,10 +48,10 @@ if(WITH_GPU)
     nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
     add_dependencies(tensor tensor_util)
   else()
-    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
+    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
   endif(WIN32)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
 endif()
 
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h
index d9c76881b7..67054eccb3 100644
--- a/paddle/fluid/framework/attribute.h
+++ b/paddle/fluid/framework/attribute.h
@@ -165,7 +165,7 @@ template <typename T>
 class GreaterThanChecker {
  public:
   explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
-  void operator()(T& value) const {
+  void operator()(const T& value) const {
     PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
   }
 
@@ -177,7 +177,7 @@ template <typename T>
 class EqualGreaterThanChecker {
  public:
   explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
-  void operator()(T& value) const {
+  void operator()(const T& value) const {
     PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
   }
 
@@ -193,7 +193,7 @@ class DefaultValueSetter {
  public:
   explicit DefaultValueSetter(T default_value)
       : default_value_(default_value) {}
-  void operator()(T& value) const { value = default_value_; }  // NOLINT
+  void operator()(T* value) const { *value = default_value_; }
 
  private:
   T default_value_;
@@ -203,7 +203,7 @@ template <typename T>
 class EnumInContainer {
  public:
   explicit EnumInContainer(const std::unordered_set<T>& c) : container_(c) {}
-  void operator()(T& val) const {
+  void operator()(const T& val) const {
     PADDLE_ENFORCE(container_.find(val) != container_.end(),
                    "Value %s is not in enum container %s", val,
                    ContainerDebugString());
@@ -232,7 +232,8 @@ class EnumInContainer {
 // an attribute can have more than one limits
 template <typename T>
 class TypedAttrChecker {
-  typedef std::function<void(T&)> ValueChecker;
+  typedef std::function<void(T*)> DefaultValueChecker;
+  typedef std::function<void(const T&)> ValueChecker;
 
  public:
   explicit TypedAttrChecker(const std::string& attr_name)
@@ -268,17 +269,17 @@ class TypedAttrChecker {
     return *this;
   }
 
-  void operator()(AttributeMap& attr_map) const {  // NOLINT
-    if (!attr_map.count(attr_name_)) {
+  void operator()(AttributeMap* attr_map) const {
+    if (!attr_map->count(attr_name_)) {
       // user do not set this attr
       PADDLE_ENFORCE(!default_value_setter_.empty(),
                      "Attribute '%s' is required!", attr_name_);
       // default_value_setter_ has no more than one element
       T val;
-      (default_value_setter_[0])(val);
-      attr_map[attr_name_] = val;
+      (default_value_setter_[0])(&val);
+      (*attr_map)[attr_name_] = val;
     }
-    Attribute& attr = attr_map.at(attr_name_);
+    Attribute& attr = attr_map->at(attr_name_);
     ExtractAttribute<T> extract_attr(attr_name_);
     T* attr_value = extract_attr(attr);
     for (const auto& checker : value_checkers_) {
@@ -289,12 +290,12 @@ class TypedAttrChecker {
  private:
   std::string attr_name_;
   std::vector<ValueChecker> value_checkers_;
-  std::vector<ValueChecker> default_value_setter_;
+  std::vector<DefaultValueChecker> default_value_setter_;
 };
 
 // check whether op's all attributes fit their own limits
 class OpAttrChecker {
-  typedef std::function<void(AttributeMap&)> AttrChecker;
+  typedef std::function<void(AttributeMap*)> AttrChecker;
 
  public:
   template <typename T>
@@ -304,7 +305,7 @@ class OpAttrChecker {
     return *(checker.target<TypedAttrChecker<T>>());
   }
 
-  void Check(AttributeMap& attr_map) const {  // NOLINT
+  void Check(AttributeMap* attr_map) const {
     for (const auto& checker : attr_checkers_) {
       checker(attr_map);
     }
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index a6d583777a..2cbfa24951 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -355,7 +355,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
           BuildStrategy::GradientScaleStrategy::kCustomized) {
         // TODO(paddle-dev): Why is there no input for this op_handle?
         auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
-        CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0]);
+        auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType();
+        CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0],
+                              out_dtype);
       }
       // This assumes the backward generating code will ensure IsScaleLossOp
       // is true only for the op that scale the final scalar loss.
@@ -658,14 +660,14 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(
 
 void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
     ir::Graph *result, const std::string &loss_grad_name,
-    ir::Node *out_var_node) const {
+    ir::Node *out_var_node, proto::VarType::Type dtype) const {
   size_t num_parallel_devices = Get<size_t>("num_parallel_devices");
   for (size_t i = 0; i < places_.size(); ++i) {
     // Insert ScaleCost OpHandle
     auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]);
     auto *op_handle = new ScaleLossGradOpHandle(
         result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation),
-        num_parallel_devices, local_scopes_[i], places_[i], dev_ctx);
+        num_parallel_devices, local_scopes_[i], places_[i], dev_ctx, dtype);
     result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
 
     // FIXME: Currently ScaleLossGradOp only use device_count as scale
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index 0556232aa4..5736102ddc 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -68,7 +68,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
 
   void CreateScaleLossGradOp(ir::Graph *result,
                              const std::string &loss_grad_name,
-                             ir::Node *out_var_node) const;
+                             ir::Node *out_var_node,
+                             proto::VarType::Type dtype) const;
 
   VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
                             int dst_dev_id) const;
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index ef16265997..e1b8e8fe05 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -22,39 +22,66 @@ namespace details {
 ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
                                              Scope *scope,
                                              platform::Place place,
-                                             platform::DeviceContext *dev_ctx)
+                                             platform::DeviceContext *dev_ctx,
+                                             proto::VarType::Type dtype)
     : OpHandleBase(node),
       coeff_(static_cast<float>(1.0 / num_dev)),
       scope_(scope),
-      place_(place) {
+      place_(place),
+      out_dtype_(dtype) {
   this->SetDeviceContext(place_, dev_ctx);
 }
 
 ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
 
+struct ScaleLossGradFunctor {
+  float coeff_;
+  Tensor *out_;
+  platform::Place place_;
+  OpHandleBase *op_handle_;
+  proto::VarType::Type out_dtype_;
+  platform::DeviceContext *ctx_;
+
+  ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place,
+                       OpHandleBase *op_handle, proto::VarType::Type dtype,
+                       platform::DeviceContext *ctx)
+      : coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {}
+
+  template <typename OutT>
+  void apply() const {
+    auto *out_data = out_->mutable_data<OutT>(place_);
+    if (platform::is_cpu_place(place_)) {
+      *out_data = static_cast<OutT>(coeff_);
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      OutT cast_coeff = static_cast<OutT>(coeff_);
+      auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
+      memory::Copy(boost::get<platform::CUDAPlace>(place_), out_data,
+                   platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_),
+                   stream);
+      VLOG(10) << place_ << "RUN Scale loss grad op";
+
+#endif
+    }
+  }
+};
+
 void ScaleLossGradOpHandle::RunImpl() {
   // Doesn't wait any event
   std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
   auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
 
-  float *tmp = local_scope.FindVar(var_name)
-                   ->GetMutable<LoDTensor>()
-                   ->mutable_data<float>(make_ddim({1}), place_);
+  auto *tensor = local_scope.FindVar(var_name)->GetMutable<LoDTensor>();
+  tensor->Resize(make_ddim({1}));
 
-  if (platform::is_cpu_place(place_)) {
-    *tmp = coeff_;
-  } else {
 #ifdef PADDLE_WITH_CUDA
-    this->RunAndRecordEvent([&] {
-      auto stream = static_cast<platform::CUDADeviceContext *>(
-                        this->dev_ctxes_.at(place_))
-                        ->stream();
-      memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
-                   platform::CPUPlace(), &coeff_, sizeof(float), stream);
-      VLOG(10) << place_ << "RUN Scale loss grad op";
-    });
+  ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_,
+                            this->dev_ctxes_.at(place_));
+  this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
+#else
+  ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr);
+  framework::VisitDataType(out_dtype_, func);
 #endif
-  }
 }
 
 std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; }
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
index 523b55724c..8bedd1643e 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
@@ -26,8 +26,8 @@ namespace details {
 
 struct ScaleLossGradOpHandle : public OpHandleBase {
   ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope,
-                        platform::Place place,
-                        platform::DeviceContext *context);
+                        platform::Place place, platform::DeviceContext *context,
+                        proto::VarType::Type dtype);
 
   ~ScaleLossGradOpHandle() final;
 
@@ -40,6 +40,7 @@ struct ScaleLossGradOpHandle : public OpHandleBase {
   float coeff_;
   Scope *scope_;
   platform::Place place_;
+  proto::VarType::Type out_dtype_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 2fe1c94ec0..0e7b0cbeb9 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -643,7 +643,7 @@ void OpDesc::CheckAttrs() {
     // not by users.
     return;
   }
-  checker->Check(attrs_);
+  checker->Check(&attrs_);
 }
 
 void OpDesc::InferShape(const BlockDesc &block) const {
diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index ca31303f77..2311614c33 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -82,6 +82,10 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
   AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namesope.")
       .SetDefault("");
 
+  AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
+                                    "Callstack for Op Creatation.")
+      .SetDefault({});
+
   Validate();
 }
 
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 4c59c73d87..0a0f8f4655 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -47,6 +47,7 @@ class OpProtoAndCheckerMaker {
   static const char *OpRoleAttrName() { return "op_role"; }
   static const char *OpRoleVarAttrName() { return "op_role_var"; }
   static const char *OpNamescopeAttrName() { return "op_namescope"; }
+  static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
 
   void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
 
diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc
index bfc411ca2c..346d14d408 100644
--- a/paddle/fluid/framework/op_registry.cc
+++ b/paddle/fluid/framework/op_registry.cc
@@ -24,7 +24,7 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
     const VariableNameMap& outputs, AttributeMap attrs) {
   auto& info = OpInfoMap::Instance().Get(type);
   if (info.Checker() != nullptr) {
-    info.Checker()->Check(attrs);
+    info.Checker()->Check(&attrs);
   }
   auto op = info.Creator()(type, inputs, outputs, attrs);
   return std::unique_ptr<OperatorBase>(op);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index fec311e3ee..ac2828136b 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -16,10 +16,15 @@ limitations under the License. */
 #include <glog/logging.h>
 
 #include <algorithm>
-
+#include <sstream>
+#include <string>
+#include <vector>
+#include "gflags/gflags.h"
+#include "glog/logging.h"
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
@@ -157,27 +162,59 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames,
 }
 
 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
-  VLOG(4) << place << " " << DebugStringEx(&scope);
-  if (platform::is_gpu_place(place)) {
+  try {
+    if (VLOG_IS_ON(4)) {
+      VLOG(4) << place << " " << DebugStringEx(&scope);
+    }
+    if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
-    PADDLE_THROW("Cannot run operator on place %s", place);
+      PADDLE_THROW("Cannot run operator on place %s", place);
 #else
-    auto dev_id = boost::get<platform::CUDAPlace>(place).device;
-    platform::SetDeviceId(dev_id);
+      auto dev_id = boost::get<platform::CUDAPlace>(place).device;
+      platform::SetDeviceId(dev_id);
 #endif
-  }
+    }
 
-  // The profile has a process-wide mutex, results in serious performance issue
-  // in concurrency scenerio. Here use an `if` to fix this issue.
-  // Please not remove the `if`, ask @Superjomn if there are any concern.
-  if (platform::IsProfileEnabled()) {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    platform::RecordEvent record_event(Type(), pool.Get(place));
-    RunImpl(scope, place);
-  } else {
-    RunImpl(scope, place);
+    // The profile has a process-wide mutex, results in serious performance
+    // issue
+    // in concurrency scenerio. Here use an `if` to fix this issue.
+    // Please not remove the `if`, ask @Superjomn if there are any concern.
+    if (platform::IsProfileEnabled()) {
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      platform::RecordEvent record_event(Type(), pool.Get(place));
+      RunImpl(scope, place);
+    } else {
+      RunImpl(scope, place);
+    }
+
+    if (VLOG_IS_ON(3)) {
+      VLOG(3) << place << " " << DebugStringEx(&scope);
+    }
+  } catch (platform::EnforceNotMet exception) {
+    if (Attrs().count("sub_block") != 0) {
+      throw exception;
+    }
+
+    auto& callstack = Attr<std::vector<std::string>>(
+        OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
+
+    if (callstack.empty()) {
+      throw exception;
+    }
+    std::ostringstream sout;
+    sout << "Invoke operator " << Type() << " error.\n";
+    sout << "Python Callstacks: \n";
+    for (auto& line : callstack) {
+      sout << line;
+    }
+    sout << "C++ Callstacks: \n";
+    sout << exception.err_str_;
+    exception.err_str_ = sout.str();
+    throw exception;
+  } catch (...) {
+    std::rethrow_exception(std::current_exception());
   }
-  VLOG(3) << place << " " << DebugStringEx(&scope);
 }
 
 bool OperatorBase::HasInputs(const std::string& name) const {
@@ -1061,8 +1098,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
           t = &(var->Get<SelectedRows>().value());
         }
         if (t != nullptr) {
-          PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s",
-                         ipt_name, DebugString());
+          PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized",
+                         ipt_name);
           int tmp = static_cast<int>(t->type());
           PADDLE_ENFORCE(
               tmp == data_type || data_type == -1,
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index 5b09cad06c..ef096c2b81 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -28,8 +28,7 @@ void Tensor::check_memory_size() const {
       "or maybe the required data-type mismatches the data already stored.");
 }
 
-Tensor::Tensor(std::type_index type)
-    : type_(framework::ToDataType(type)), offset_(0) {}
+Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {}
 
 size_t Tensor::memory_size() const {
   return holder_ == nullptr ? 0UL : holder_->size() - offset_;
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 2e110133a3..40606d9b06 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -69,7 +69,7 @@ class Tensor {
  public:
   Tensor() : type_(proto::VarType::FP32), offset_(0) {}
 
-  explicit Tensor(std::type_index type);
+  explicit Tensor(const proto::VarType::Type&);
 
   /*! Return a pointer to mutable memory block. */
   template <typename T>
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index cab6d9b67e..871c7bd2a7 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/temporary_allocator.h"
 
 namespace paddle {
 namespace framework {
@@ -151,5 +152,26 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
                src_ptr, size);
 }
 
+template <typename T>
+paddle::framework::Tensor GetTensor(
+    memory::allocation::AllocationPtr temp_allocation_ptr,
+    const framework::DDim& dim) {
+  auto& deleter = temp_allocation_ptr.get_deleter();
+  auto* allocation_ptr = temp_allocation_ptr.release();
+  auto shared_allocation =
+      std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter);
+
+  PADDLE_ENFORCE(
+      dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
+      "The AllocationPtr must be TemporaryAllocation.");
+  PADDLE_ENFORCE_EQ(allocation_ptr->size(),
+                    framework::product(dim) * sizeof(T));
+
+  paddle::framework::Tensor temp_tensor(
+      framework::ToDataType(std::type_index(typeid(T))));
+  temp_tensor.Resize(dim);
+  temp_tensor.ResetHolder(std::move(shared_allocation));
+  return temp_tensor;
+}
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h
index 4a7b31c7d4..2519f5e7ac 100644
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -18,11 +18,11 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/operators/math/vol2col.h"
-#include "paddle/fluid/platform/create_tensor_with_allocationptr.h"
 
 namespace paddle {
 namespace operators {
@@ -161,10 +161,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
       auto tmp_allocation_ptr =
           platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
               framework::product(col_shape) * sizeof(T));
-      Tensor tep_tensor =
-          platform::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
-
-      col.ShareDataWith(tep_tensor);
+      col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
       col_matrix.ShareDataWith(col);
       col_matrix.Resize(col_matrix_shape);
     }
@@ -299,10 +296,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
       auto tmp_allocation_ptr =
           platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
               framework::product(col_shape) * sizeof(T));
-      Tensor tep_tensor =
-          platform::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
-
-      col.ShareDataWith(tep_tensor);
+      col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
       col_matrix.ShareDataWith(col);
       col_matrix.Resize(col_matrix_shape);
     }
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
index 1a149298fd..ae669f5525 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
 
 REGISTER_OP_CUDA_KERNEL(
     elementwise_div,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
+                              paddle::platform::float16>,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_div_grad,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::float16>,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
index 833c407282..50b2322b17 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
-    elementwise_mul,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    elementwise_mul, ops::ElementwiseMulKernel<plat::CUDADeviceContext, float>,
+    ops::ElementwiseMulKernel<plat::CUDADeviceContext, double>,
+    ops::ElementwiseMulKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseMulKernel<plat::CUDADeviceContext, int64_t>,
+    ops::ElementwiseMulKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_mul_grad,
-    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
+    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, float>,
+    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, double>,
+    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int64_t>,
+    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc
index 9538177460..e80a703c30 100644
--- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc
+++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fill_zeros_like_op.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
@@ -22,4 +23,6 @@ REGISTER_OP_CUDA_KERNEL(
     ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int64_t>,
     ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, float>,
     ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
+                             paddle::platform::float16>,
     ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);
diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu
index b10a19b658..e925e7bb59 100644
--- a/paddle/fluid/operators/math/concat_and_split.cu
+++ b/paddle/fluid/operators/math/concat_and_split.cu
@@ -131,8 +131,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
     int in_col = input[0].numel() / in_row;
     int out_row = in_row, out_col = 0;
 
-    std::vector<T*> inputs_data(in_num);
+    std::vector<const T*> inputs_data;
     std::vector<int> inputs_col(in_num + 1);
+    inputs_data.reserve(in_num);
 
     inputs_col[0] = 0;
     bool sameShape = true;
@@ -143,7 +144,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
       }
       out_col += t_cols;
       inputs_col[i + 1] = out_col;
-      inputs_data[i] = const_cast<T*>(input[i].data<T>());
+      inputs_data.emplace_back(input[i].data<T>());
     }
 
     // computation
diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu
index b255d2a7c4..4682940f7e 100644
--- a/paddle/fluid/operators/metrics/accuracy_op.cu
+++ b/paddle/fluid/operators/metrics/accuracy_op.cu
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <thrust/reduce.h>
 #include "paddle/fluid/operators/metrics/accuracy_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
@@ -94,6 +95,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
 
 // FIXME(typhoonzero): types of T is for inference data.
 // label data is always int64
-REGISTER_OP_CUDA_KERNEL(accuracy,
-                        paddle::operators::AccuracyOpCUDAKernel<float>,
-                        paddle::operators::AccuracyOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
+    paddle::operators::AccuracyOpCUDAKernel<double>,
+    paddle::operators::AccuracyOpCUDAKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/optimizers/momentum_op.cu b/paddle/fluid/operators/optimizers/momentum_op.cu
index 8ce739de8d..7f9e724640 100644
--- a/paddle/fluid/operators/optimizers/momentum_op.cu
+++ b/paddle/fluid/operators/optimizers/momentum_op.cu
@@ -14,8 +14,11 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/optimizers/momentum_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     momentum, ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MomentumOpKernel<paddle::platform::CUDADeviceContext,
+                          paddle::platform::float16>);
diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h
index 71f079e4d9..f6ef83c3ba 100644
--- a/paddle/fluid/operators/optimizers/momentum_op.h
+++ b/paddle/fluid/operators/optimizers/momentum_op.h
@@ -237,7 +237,8 @@ class SparseMomentumFunctor<T, UseNesterov> {
   inline HOSTDEVICE void operator()(size_t i) {
     auto row_idx =
         math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
-    T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0;
+    T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_]
+                       : static_cast<T>(0);
     // put memory access in register
     const T p = p_[i];
     const T lr = lr_[0];
@@ -282,7 +283,8 @@ class SparseMomentumFunctor<T, NoNesterov> {
   inline HOSTDEVICE void operator()(size_t i) {
     auto row_idx =
         math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
-    T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0;
+    T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_]
+                       : static_cast<T>(0);
     // put memory access in register
     const T p = p_[i];
     const T lr = lr_[0];
diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
index 8fceed3558..57d6f4b3ea 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
@@ -52,7 +52,7 @@ class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker {
                  "The maximum length of the sequence. If maxlen < 0, maxlen "
                  "= max(Input(X)).")
         .SetDefault(-1)
-        .AddCustomChecker([](int &v) {
+        .AddCustomChecker([](const int &v) {
           PADDLE_ENFORCE(v < 0 || v >= 1,
                          "Attr(maxlen) must be less than 0 or larger than 1");
         });
diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc
index c17d1afc30..9e77f7252d 100644
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
@@ -21,7 +21,7 @@ class TopkOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of TopkOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -44,12 +44,25 @@ class TopkOp : public framework::OperatorWithKernel {
     ctx->ShareLoD("X", "Out");
     ctx->ShareLoD("X", "Indices");
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.device_context(), layout_, library_);
+  }
 };
 
 class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X", "(Tensor) The input of Topk op");
+    AddInput("K",
+             "(Tensor)  Number of top elements to look for along "
+             "the last dimension (along each row for matrices).")
+        .AsDispensable();
     AddOutput("Out", "(Tensor) The output tensor of Topk op");
     AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
     AddComment(R"DOC(
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
index 0cad224ca8..c27039dd0a 100644
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
@@ -150,7 +151,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
         if (k < MaxLength - (*beam)) {
           topk[k] = topk[k + *beam];
         } else {
-          topk[k].set(-INFINITY, -1);
+          topk[k].set(-static_cast<T>(INFINITY), -1);
         }
       }
       if (!(*is_empty)) {
@@ -160,7 +161,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
     }
 
     *max = topk[MaxLength - 1];
-    if ((*max).v == -1) *is_empty = true;
+    if ((*max).v == -static_cast<T>(1)) *is_empty = true;
     *beam = 0;
   }
 }
@@ -181,7 +182,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
         if (k < MaxLength - *beam) {
           topk[k] = topk[k + *beam];
         } else {
-          topk[k].set(-INFINITY, -1);
+          topk[k].set(-static_cast<T>(INFINITY), -1);
         }
       }
       if (!(*is_empty)) {
@@ -278,7 +279,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
     bool firststep = true;
 
     for (int j = 0; j < MaxLength; j++) {
-      topk[j].set(-INFINITY, -1);
+      topk[j].set(-static_cast<T>(INFINITY), -1);
     }
     while (top_num) {
       ThreadGetTopK<T, MaxLength, BlockSize>(
@@ -326,6 +327,17 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
     auto* indices = ctx.Output<Tensor>("Indices");
     size_t k = static_cast<int>(ctx.Attr<int>("k"));
 
+    auto* k_t = ctx.Input<Tensor>("K");
+    if (k_t) {
+      Tensor k_host;
+      framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host);
+      k = k_host.data<int>()[0];
+      framework::DDim output_dims = output->dims();
+      output_dims[output_dims.size() - 1] = k;
+      output->Resize(output_dims);
+      indices->Resize(output_dims);
+    }
+
     const T* input_data = input->data<T>();
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
     // FIXME(typhoonzero): data is always converted to type T?
@@ -362,5 +374,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>,
-                        paddle::operators::TopkOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    top_k, paddle::operators::TopkOpCUDAKernel<float>,
+    paddle::operators::TopkOpCUDAKernel<double>,
+    paddle::operators::TopkOpCUDAKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h
index 76ece57b39..f7bac67300 100644
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
@@ -37,8 +37,16 @@ class TopkKernel : public framework::OpKernel<T> {
     auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
     auto* indices = ctx.Output<Tensor>("Indices");
-    // k is determined by Attr
-    const size_t k = static_cast<int>(ctx.Attr<int>("k"));
+
+    size_t k = static_cast<int>(ctx.Attr<int>("k"));
+    auto* k_t = ctx.Input<Tensor>("K");
+    if (k_t) {
+      k = k_t->data<int>()[0];
+      framework::DDim output_dims = output->dims();
+      output_dims[output_dims.size() - 1] = k;
+      output->Resize(output_dims);
+      indices->Resize(output_dims);
+    }
 
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
     int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
diff --git a/paddle/fluid/platform/create_tensor_with_allocationptr.h b/paddle/fluid/platform/create_tensor_with_allocationptr.h
deleted file mode 100644
index 00fcc5f862..0000000000
--- a/paddle/fluid/platform/create_tensor_with_allocationptr.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/temporary_allocator.h"
-namespace paddle {
-namespace platform {
-
-template <typename T>
-paddle::framework::Tensor GetTensor(
-    memory::allocation::AllocationPtr temp_allocation_ptr,
-    const framework::DDim &dim) {
-  auto &deleter = temp_allocation_ptr.get_deleter();
-  auto *allocation_ptr = temp_allocation_ptr.release();
-  auto shared_allocation =
-      std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter);
-
-  PADDLE_ENFORCE(dynamic_cast<TemporaryAllocation *>(allocation_ptr) != nullptr,
-                 "The AllocationPtr must be TemporaryAllocation.");
-  PADDLE_ENFORCE_EQ(allocation_ptr->size(),
-                    framework::product(dim) * sizeof(T));
-
-  paddle::framework::Tensor temp_tensor(std::type_index(typeid(T)));
-  temp_tensor.Resize(dim);
-  temp_tensor.ResetHolder(std::move(shared_allocation));
-  return temp_tensor;
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 81c443d758..022afb686b 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -256,10 +256,11 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
 
   LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device
                           << ", CUDA Capability: " << compute_capability_
-                          << ", Driver Version: " << driver_version_ / 1000
+                          << ", Driver API Version: " << driver_version_ / 1000
                           << "." << (driver_version_ % 100) / 10
-                          << ", Runtime Version: " << runtime_version_ / 1000
-                          << "." << (runtime_version_ % 100) / 10;
+                          << ", Runtime API Version: "
+                          << runtime_version_ / 1000 << "."
+                          << (runtime_version_ % 100) / 10;
   size_t cudnn_dso_ver = dynload::cudnnGetVersion();
   LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
                           << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index af9744dcb8..7e87580189 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -41,7 +41,28 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-/*! \brief device temporary allocator singleton */
+/*! \brief device temporary allocator singleton.
+ *
+ * Some operator needs temporary memory during computation, for example,
+ * conv_gemm, which needs use col to store the result of im2col. If we
+ * create a stack memory which is used by CUDA Kernel, before the
+ * Computation(...) returns, we should add ctx->Wait(), because the
+ * execution of CUDA is async, if there doesn't have ctx->Wait(),
+ * the temporary memory will be released before the CUDA Kernel uses
+ * it.
+ *
+ * DeviceTemporaryAllocator is a singleton, which contains a
+ * `TemporaryAllocator` for each <Place, Stream>. And the TemporaryAllocator
+ * contains a temp_allocation_queue which is used to store the temporary
+ * allocations. The allocation, which is allocated by TemporaryAllocator,
+ * is a unique_ptr,  and when it is not held by any variable, it will be
+ * pushed into the temp_allocation_queue. There are two opportunities to free
+ * the allocations of temp_allocation_queue:
+ *  - when the Stream calls cudaStreamSynchronize;
+ *  - when the allocation size of opportunities exceeds a certain threshold
+ *    (defined by FLAGS_limit_of_temporary_allocation).
+ *
+ * */
 class DeviceTemporaryAllocator {
  public:
   static DeviceTemporaryAllocator& Instance() {
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index e353658c7f..8df8e32098 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -23,6 +23,7 @@
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/platform/dynload/nccl.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
 
 #define NCCL_ID_VARNAME "NCCLID"
 
@@ -38,6 +39,8 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) {
     return ncclInt;
   } else if (type == framework::proto::VarType::INT64) {
     return ncclInt64;
+  } else if (type == framework::proto::VarType::FP16) {
+    return ncclFloat16;
   } else {
     PADDLE_THROW("Not supported");
   }
diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h
index 4e32d2d695..812c4a3331 100644
--- a/paddle/fluid/platform/temporary_allocator.h
+++ b/paddle/fluid/platform/temporary_allocator.h
@@ -29,6 +29,19 @@ class TemporaryAllocation : public memory::allocation::Allocation {
   memory::allocation::AllocationPtr underlying_allocation_;
 };
 
+/*! \brief the TemporaryAllocator is used to alloc the temporary allocation
+ * which used by CUDA's async operation.
+ *
+ * The TemporaryAllocator contains a temp_allocation_queue which
+ * is used to store the temporary allocations. The allocation, which is
+ * allocated by TemporaryAllocator, is a unique_ptr, and when it is not held
+ * by any variable, it will be pushed into the  temp_allocation_queue.
+ *
+ * There is one opportunity to free the allocations of temp_allocation_queue:
+ *   - when the allocation size of opportunities exceeds a certain threshold
+ *     (defined by FLAGS_limit_of_temporary_allocation).
+ *
+ * */
 class TemporaryAllocator : public memory::allocation::Allocator {
  public:
   explicit TemporaryAllocator(platform::Place place);
diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc
index 3b940b0e82..e4e5be5b89 100644
--- a/paddle/fluid/platform/temporary_allocator_test.cc
+++ b/paddle/fluid/platform/temporary_allocator_test.cc
@@ -14,8 +14,7 @@
 
 #include "paddle/fluid/platform/temporary_allocator.h"
 #include <gtest/gtest.h>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/create_tensor_with_allocationptr.h"
+#include "paddle/fluid/framework/tensor_util.h"
 DECLARE_double(limit_of_temporary_allocation);
 
 namespace paddle {
@@ -47,6 +46,7 @@ TEST(temporary_allocator, temporary_allocator) {
 
 TEST(temporary_allocator, add_callback) {
 #ifdef PADDLE_WITH_CUDA
+  const double limit = FLAGS_limit_of_temporary_allocation;
   FLAGS_limit_of_temporary_allocation = 10;
   platform::CUDAPlace gpu_place(0);
   TemporaryAllocator gpu_alloc(gpu_place);
@@ -63,7 +63,7 @@ TEST(temporary_allocator, add_callback) {
   });
   { gpu_alloc.Allocate(100); }
   PADDLE_ENFORCE(deleted);
-  FLAGS_limit_of_temporary_allocation = -1;
+  FLAGS_limit_of_temporary_allocation = limit;
 #endif
 }
 
@@ -75,8 +75,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
     auto allocation = cpu_alloc.Allocate(memory_size);
     void* address = allocation->ptr();
     int numel = memory_size / sizeof(float);
-    framework::Tensor tensor =
-        GetTensor<float>(std::move(allocation), framework::make_ddim({numel}));
+    framework::Tensor tensor = framework::GetTensor<float>(
+        std::move(allocation), framework::make_ddim({numel}));
     PADDLE_ENFORCE_EQ(address, tensor.data<float>());
     PADDLE_ENFORCE_EQ(tensor.numel(), numel);
   }
@@ -90,8 +90,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
     auto allocation = gpu_alloc.Allocate(memory_size);
     void* address = allocation->ptr();
     int numel = memory_size / sizeof(float);
-    framework::Tensor tensor =
-        GetTensor<float>(std::move(allocation), framework::make_ddim({numel}));
+    framework::Tensor tensor = framework::GetTensor<float>(
+        std::move(allocation), framework::make_ddim({numel}));
     PADDLE_ENFORCE_EQ(address, tensor.data<float>());
     PADDLE_ENFORCE_EQ(tensor.numel(), numel);
   }
@@ -116,7 +116,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
     {
       auto allocation = cpu_alloc.Allocate(memory_size);
       address = allocation->ptr();
-      framework::Tensor tensor = GetTensor<float>(
+      framework::Tensor tensor = framework::GetTensor<float>(
           std::move(allocation), framework::make_ddim({numel}));
       PADDLE_ENFORCE_EQ(address, tensor.data<float>());
       PADDLE_ENFORCE_EQ(tensor.numel(), numel);
@@ -138,7 +138,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
     {
       auto allocation = gpu_alloc.Allocate(memory_size);
       address = allocation->ptr();
-      framework::Tensor tensor = GetTensor<float>(
+      framework::Tensor tensor = framework::GetTensor<float>(
           std::move(allocation), framework::make_ddim({numel}));
       PADDLE_ENFORCE_EQ(address, tensor.data<float>());
       PADDLE_ENFORCE_EQ(tensor.numel(), numel);
diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc
index 06d8b65fb1..f8ded9f94e 100644
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
@@ -49,6 +49,9 @@ void BindConstValue(pybind11::module* m) {
   op_proto_and_checker_maker.def(
       "kOpNameScopeAttrName",
       framework::OpProtoAndCheckerMaker::OpNamescopeAttrName);
+  op_proto_and_checker_maker.def(
+      "kOpCreationCallstackAttrName",
+      framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName);
 }
 
 }  // namespace pybind
diff --git a/paddle/scripts/installation_validate.py b/paddle/scripts/installation_validate.py
new file mode 100644
index 0000000000..f84e2f4b17
--- /dev/null
+++ b/paddle/scripts/installation_validate.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import paddle as pd
+
+print(pd.__version__)
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 2e6b40148d..418dc13468 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -79,6 +79,7 @@ function cmake_gen() {
                 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7
             -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7
             -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/lib/libpython2.7.dylib"
+            pip install --user -r ${PADDLE_ROOT}/python/requirements.txt
             else
                 exit 1
             fi
@@ -91,6 +92,7 @@ function cmake_gen() {
             -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/
             -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib"
                 WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+                pip3.5 install --user -r ${PADDLE_ROOT}/python/requirements.txt
             else
                 exit 1
             fi
@@ -103,6 +105,7 @@ function cmake_gen() {
             -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
             -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib"
                 WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+                pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt
             else
                 exit 1
             fi
@@ -115,6 +118,7 @@ function cmake_gen() {
             -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/
             -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib"
                 WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+                pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt
             else
                 exit 1
             fi
@@ -441,7 +445,9 @@ EOF
         # make install should also be test when unittest
         make install -j 8
         if [ "$1" == "cp27-cp27m" ]; then
+            set -e
             pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+            python ${PADDLE_ROOT}/paddle/scripts/installation_validate.py
         elif [ "$1" == "cp35-cp35m" ]; then
             pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
         elif [ "$1" == "cp36-cp36m" ]; then
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index 13d2893fd1..af02721eb7 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -44,6 +44,8 @@ class DataToLoDTensorConverter(object):
             self.dtype = 'int64'
         elif dtype == core.VarDesc.VarType.FP64:
             self.dtype = 'float64'
+        elif dtype == core.VarDesc.VarType.FP16:
+            self.dtype = 'float16'
         elif dtype == core.VarDesc.VarType.INT32:
             self.dtype = 'int32'
         elif dtype == core.VarDesc.VarType.UINT8:
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index de30ed2fc5..3427fb0c4a 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -20,6 +20,7 @@ import os
 import re
 import six
 import sys
+import traceback
 
 import numpy as np
 
@@ -604,6 +605,10 @@ class Operator(object):
         if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
             del op_attrs[role_var_name]
 
+        callstack_var_name = op_maker.kOpCreationCallstackAttrName()
+        op_attrs[callstack_var_name] = list(
+            reversed(traceback.format_stack()))[1:]
+
         if len(self.desc.type()) != 0:
             return
         if type is None:
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index b37ebbe517..26d1f8f4d2 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -18,6 +18,7 @@ from . import framework
 import numpy as np
 import contextlib
 from .core import VarDesc
+from . import unique_name
 
 __all__ = [
     'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear',
@@ -207,16 +208,39 @@ class UniformInitializer(Initializer):
         # Initialization Ops should be prepended and not appended
         if self._seed == 0:
             self._seed = block.program.random_seed
+
+        # to be compatible of fp16 initalizers
+        if var.dtype == VarDesc.VarType.FP16:
+            out_dtype = VarDesc.VarType.FP32
+            out_var = block.create_var(
+                name=unique_name.generate(".".join(['gaussian_random', 'tmp'])),
+                shape=var.shape,
+                dtype=out_dtype,
+                type=VarDesc.VarType.LOD_TENSOR,
+                persistable=False)
+        else:
+            out_dtype = var.dtype
+            out_var = var
+
         op = block._prepend_op(
             type="uniform_random",
-            outputs={"Out": var},
+            outputs={"Out": out_var},
             attrs={
                 "shape": var.shape,
-                "dtype": int(var.dtype),
+                "dtype": out_dtype,
                 "min": self._low,
                 "max": self._high,
                 "seed": self._seed
             })
+
+        if var.dtype == VarDesc.VarType.FP16:
+            block.append_op(
+                type="cast",
+                inputs={"X": out_var},
+                outputs={"Out": var},
+                attrs={"in_dtype": out_var.dtype,
+                       "out_dtype": var.dtype})
+
         var.op = op
         return op
 
@@ -261,17 +285,39 @@ class NormalInitializer(Initializer):
         # Initialization Ops should be prepended and not appended
         if self._seed == 0:
             self._seed = block.program.random_seed
+
+        # to be compatible of fp16 initalizers
+        if var.dtype == VarDesc.VarType.FP16:
+            out_dtype = VarDesc.VarType.FP32
+            out_var = block.create_var(
+                name=unique_name.generate(".".join(['gaussian_random', 'tmp'])),
+                shape=var.shape,
+                dtype=out_dtype,
+                type=VarDesc.VarType.LOD_TENSOR,
+                persistable=False)
+        else:
+            out_dtype = var.dtype
+            out_var = var
+
         op = block._prepend_op(
             type="gaussian_random",
-            outputs={"Out": var},
+            outputs={"Out": out_var},
             attrs={
                 "shape": var.shape,
-                "dtype": int(var.dtype),
+                "dtype": out_dtype,
                 "mean": self._mean,
                 "std": self._std_dev,
                 "seed": self._seed,
                 "use_mkldnn": False
             })
+
+        if var.dtype == VarDesc.VarType.FP16:
+            block.append_op(
+                type="cast",
+                inputs={"X": out_var},
+                outputs={"Out": var},
+                attrs={"in_dtype": out_var.dtype,
+                       "out_dtype": var.dtype})
         var.op = op
         return op
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index bdfcc8c4e2..cc1fdbd285 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2801,6 +2801,10 @@ def batch_norm(input,
     helper = LayerHelper('batch_norm', **locals())
     dtype = helper.input_dtype()
 
+    # use fp32 for bn parameter
+    if dtype == core.VarDesc.VarType.FP16:
+        dtype = core.VarDesc.VarType.FP32
+
     input_shape = input.shape
     if data_layout == 'NCHW':
         channel_num = input_shape[1]
@@ -2835,7 +2839,7 @@ def batch_norm(input,
             trainable=False,
             do_model_average=do_model_average_for_mean_and_var),
         shape=param_shape,
-        dtype=input.dtype)
+        dtype=dtype)
     mean.stop_gradient = True
 
     variance = helper.create_parameter(
@@ -2845,7 +2849,7 @@ def batch_norm(input,
             trainable=False,
             do_model_average=do_model_average_for_mean_and_var),
         shape=param_shape,
-        dtype=input.dtype)
+        dtype=dtype)
     variance.stop_gradient = True
 
     # create output
@@ -4526,7 +4530,7 @@ def topk(input, k, name=None):
     Args:
         input(Variable): The input variable which can be a vector or Tensor with
             higher rank.
-        k(int):  The number of top elements to look for along the last dimension
+        k(int | Variable):  The number of top elements to look for along the last dimension
                  of input.
         name(str|None): A name for this layer(optional). If set None, the layer
                        will be named automatically.
@@ -4549,12 +4553,18 @@ def topk(input, k, name=None):
     helper = LayerHelper("top_k", **locals())
     values = helper.create_variable_for_type_inference(dtype=input.dtype)
     indices = helper.create_variable_for_type_inference(dtype="int64")
+    inputs = {"X": [input]}
+    attrs = None
+    if isinstance(k, Variable):
+        inputs['K'] = k
+    else:
+        attrs = {'k': k}
     helper.append_op(
         type="top_k",
-        inputs={"X": [input]},
+        inputs=inputs,
         outputs={"Out": [values],
                  "Indices": [indices]},
-        attrs={"k": k})
+        attrs=attrs)
     values.stop_gradient = True
     indices.stop_gradient = True
     return values, indices
@@ -7943,7 +7953,7 @@ def unstack(x, axis=0, num=None):
             num = x.shape[axis]
 
     outs = []
-    for _ in num:
+    for _ in range(num):
         outs.append(helper.create_variable_for_type_inference(x.dtype))
 
     helper.append_op(
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 76a707efdc..0fe836683b 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -368,6 +368,8 @@ class OpTest(unittest.TestCase):
                 place = core.CUDAPlace(0)
                 if core.is_float16_supported(place):
                     return [place]
+                else:
+                    return []
             else:
                 return []
         places = [fluid.CPUPlace()]
diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py
index 1b2b53f2d4..5257b0be6f 100644
--- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py
@@ -22,8 +22,10 @@ from op_test import OpTest
 class TestAccuracyOp(OpTest):
     def setUp(self):
         self.op_type = "accuracy"
+        self.dtype = np.float32
+        self.init_dtype()
         n = 8192
-        infer = np.random.random((n, 1)).astype("float32")
+        infer = np.random.random((n, 1)).astype(self.dtype)
         indices = np.random.randint(0, 2, (n, 1))
         label = np.random.randint(0, 2, (n, 1))
         self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
@@ -34,14 +36,25 @@ class TestAccuracyOp(OpTest):
                     num_correct += 1
                     break
         self.outputs = {
-            'Accuracy': np.array([num_correct / float(n)]).astype("float32"),
+            'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
             'Correct': np.array([num_correct]).astype("int32"),
             'Total': np.array([n]).astype("int32")
         }
 
+    def init_dtype(self):
+        pass
+
     def test_check_output(self):
         self.check_output()
 
 
+class TestAccuracyOpFp16(TestAccuracyOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        self.check_output(atol=1e-3)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
index cadaf1df53..15d4db590e 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
@@ -21,14 +21,16 @@ from op_test import OpTest
 class ElementwiseDivOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_div"
+        self.dtype = np.float32
+        self.init_dtype()
         """ Warning
         CPU gradient check error!
         'X': np.random.random((32,84)).astype("float32"),
         'Y': np.random.random((32,84)).astype("float32")
         """
         self.inputs = {
-            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
         }
         self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
 
@@ -46,6 +48,9 @@ class ElementwiseDivOp(OpTest):
         self.check_grad(
             ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
 
+    def init_dtype(self):
+        pass
+
 
 class TestElementwiseDivOp_scalar(ElementwiseDivOp):
     def setUp(self):
@@ -126,5 +131,21 @@ class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp):
         }
 
 
+class TestElementwiseDivOpFp16(ElementwiseDivOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=1)
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=1, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=1, no_grad_set=set('Y'))
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
index 57ba34f833..0484099188 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
@@ -135,5 +135,10 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
         }
 
 
+class TestElementwiseMulOpFp16(ElementwiseMulOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
index eec73d0beb..20f1a110c3 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
@@ -22,12 +22,22 @@ from op_test import OpTest
 class TestFillZerosLikeOp(OpTest):
     def setUp(self):
         self.op_type = "fill_zeros_like"
-        self.inputs = {'X': np.random.random((219, 232)).astype("float32")}
+        self.dtype = np.float32
+        self.init_dtype()
+        self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)}
         self.outputs = {'Out': np.zeros_like(self.inputs["X"])}
 
+    def init_dtype(self):
+        pass
+
     def test_check_output(self):
         self.check_output()
 
 
+class TestFillZerosLikeOpFp16(TestFillZerosLikeOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py
index cf4346cf2e..77ec6f9b6b 100644
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -24,11 +24,13 @@ from op_test import OpTest
 class TestMomentumOp1(OpTest):
     def setUp(self):
         self.op_type = "momentum"
+        self.dtype = np.float32
+        self.init_dtype()
 
-        param = np.random.random((123, 321)).astype("float32")
-        grad = np.random.random((123, 321)).astype("float32")
-        velocity = np.zeros((123, 321)).astype("float32")
-        learning_rate = np.array([0.001]).astype("float32")
+        param = np.random.random((123, 321)).astype(self.dtype)
+        grad = np.random.random((123, 321)).astype(self.dtype)
+        velocity = np.zeros((123, 321)).astype(self.dtype)
+        learning_rate = np.array([0.001]).astype(self.dtype)
         mu = 0.0001
         use_nesterov = False
 
@@ -50,10 +52,21 @@ class TestMomentumOp1(OpTest):
 
         self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
 
+    def init_dtype(self):
+        pass
+
     def test_check_output(self):
         self.check_output()
 
 
+class TestMomentumOpFp16(TestMomentumOp1):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        self.check_output(atol=1e-3)
+
+
 class TestMomentumOp2(OpTest):
     '''Test Momentum with default values for attributes
     '''
diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py
index 4153394c1d..37b9a9188a 100644
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py
@@ -69,7 +69,7 @@ class TestOperator(unittest.TestCase):
             set(mul_op.attr_names),
             set([
                 "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var",
-                "op_namescope"
+                "op_namescope", "op_callstack"
             ]))
         self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
         self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)
diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py
index 69b29db83a..9fbf59ed66 100644
--- a/python/paddle/fluid/tests/unittests/test_top_k_op.py
+++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py
@@ -21,15 +21,22 @@ from op_test import OpTest
 
 class TestTopkOp(OpTest):
     def setUp(self):
+        self.variable_k = False
         self.set_args()
         self.op_type = "top_k"
+        self.dtype = np.float32
+        self.init_dtype()
+
         k = self.top_k
-        input = np.random.random((self.row, k)).astype("float32")
+        input = np.random.random((self.row, k)).astype(self.dtype)
         output = np.ndarray((self.row, k))
         indices = np.ndarray((self.row, k)).astype("int64")
-
         self.inputs = {'X': input}
-        self.attrs = {'k': k}
+
+        if self.variable_k:
+            self.inputs['K'] = np.array([k]).astype("int32")
+        else:
+            self.attrs = {'k': k}
 
         for rowid in range(self.row):
             row = input[rowid]
@@ -38,6 +45,9 @@ class TestTopkOp(OpTest):
 
         self.outputs = {'Out': output, 'Indices': indices}
 
+    def init_dtype(self):
+        pass
+
     def set_args(self):
         self.row = 32
         self.top_k = 1
@@ -46,6 +56,11 @@ class TestTopkOp(OpTest):
         self.check_output()
 
 
+class TestTopkOpFp16(TestTopkOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
 class TestTopkOp3d(OpTest):
     def setUp(self):
         self.op_type = "top_k"
@@ -107,5 +122,12 @@ class TestTopkOp4(TestTopkOp):
         self.top_k = 1
 
 
+class TestTopkOp5(TestTopkOp):
+    def set_args(self):
+        self.row = 40000
+        self.top_k = 3
+        self.variable_k = True
+
+
 if __name__ == "__main__":
     unittest.main()