diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 8a8f15a4fa..9b09a03f2b 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -48,10 +48,10 @@ if(WITH_GPU) nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) add_dependencies(tensor tensor_util) else() - nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context) + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context ) endif(WIN32) else() - cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context) + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context ) endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h index d9c76881b7..67054eccb3 100644 --- a/paddle/fluid/framework/attribute.h +++ b/paddle/fluid/framework/attribute.h @@ -165,7 +165,7 @@ template class GreaterThanChecker { public: explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} - void operator()(T& value) const { + void operator()(const T& value) const { PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails."); } @@ -177,7 +177,7 @@ template class EqualGreaterThanChecker { public: explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} - void operator()(T& value) const { + void operator()(const T& value) const { PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails."); } @@ -193,7 +193,7 @@ class DefaultValueSetter { public: explicit DefaultValueSetter(T default_value) : default_value_(default_value) {} - void operator()(T& value) const { value = default_value_; } // NOLINT + void operator()(T* value) const { *value = default_value_; } private: T default_value_; @@ -203,7 +203,7 @@ template class EnumInContainer { public: explicit EnumInContainer(const std::unordered_set& c) : container_(c) {} - void operator()(T& val) const { + void operator()(const T& val) const { PADDLE_ENFORCE(container_.find(val) != container_.end(), "Value %s is not in enum container %s", val, ContainerDebugString()); @@ -232,7 +232,8 @@ class EnumInContainer { // an attribute can have more than one limits template class TypedAttrChecker { - typedef std::function ValueChecker; + typedef std::function DefaultValueChecker; + typedef std::function ValueChecker; public: explicit TypedAttrChecker(const std::string& attr_name) @@ -268,17 +269,17 @@ class TypedAttrChecker { return *this; } - void operator()(AttributeMap& attr_map) const { // NOLINT - if (!attr_map.count(attr_name_)) { + void operator()(AttributeMap* attr_map) const { + if (!attr_map->count(attr_name_)) { // user do not set this attr PADDLE_ENFORCE(!default_value_setter_.empty(), "Attribute '%s' is required!", attr_name_); // default_value_setter_ has no more than one element T val; - (default_value_setter_[0])(val); - attr_map[attr_name_] = val; + (default_value_setter_[0])(&val); + (*attr_map)[attr_name_] = val; } - Attribute& attr = attr_map.at(attr_name_); + Attribute& attr = attr_map->at(attr_name_); ExtractAttribute extract_attr(attr_name_); T* attr_value = extract_attr(attr); for (const auto& checker : value_checkers_) { @@ -289,12 +290,12 @@ class TypedAttrChecker { private: std::string attr_name_; std::vector value_checkers_; - std::vector default_value_setter_; + std::vector default_value_setter_; }; // check whether op's all attributes fit their own limits class OpAttrChecker { - typedef std::function AttrChecker; + typedef std::function AttrChecker; public: template @@ -304,7 +305,7 @@ class OpAttrChecker { return *(checker.target>()); } - void Check(AttributeMap& attr_map) const { // NOLINT + void Check(AttributeMap* attr_map) const { for (const auto& checker : attr_checkers_) { checker(attr_map); } diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index a6d583777a..2cbfa24951 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -355,7 +355,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( BuildStrategy::GradientScaleStrategy::kCustomized) { // TODO(paddle-dev): Why is there no input for this op_handle? auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; - CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0]); + auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType(); + CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0], + out_dtype); } // This assumes the backward generating code will ensure IsScaleLossOp // is true only for the op that scale the final scalar loss. @@ -658,14 +660,14 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID( void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node) const { + ir::Node *out_var_node, proto::VarType::Type dtype) const { size_t num_parallel_devices = Get("num_parallel_devices"); for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *op_handle = new ScaleLossGradOpHandle( result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), - num_parallel_devices, local_scopes_[i], places_[i], dev_ctx); + num_parallel_devices, local_scopes_[i], places_[i], dev_ctx, dtype); result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 0556232aa4..5736102ddc 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -68,7 +68,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass { void CreateScaleLossGradOp(ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node) const; + ir::Node *out_var_node, + proto::VarType::Type dtype) const; VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const; diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index ef16265997..e1b8e8fe05 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -22,39 +22,66 @@ namespace details { ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope, platform::Place place, - platform::DeviceContext *dev_ctx) + platform::DeviceContext *dev_ctx, + proto::VarType::Type dtype) : OpHandleBase(node), coeff_(static_cast(1.0 / num_dev)), scope_(scope), - place_(place) { + place_(place), + out_dtype_(dtype) { this->SetDeviceContext(place_, dev_ctx); } ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} +struct ScaleLossGradFunctor { + float coeff_; + Tensor *out_; + platform::Place place_; + OpHandleBase *op_handle_; + proto::VarType::Type out_dtype_; + platform::DeviceContext *ctx_; + + ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place, + OpHandleBase *op_handle, proto::VarType::Type dtype, + platform::DeviceContext *ctx) + : coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {} + + template + void apply() const { + auto *out_data = out_->mutable_data(place_); + if (platform::is_cpu_place(place_)) { + *out_data = static_cast(coeff_); + } else { +#ifdef PADDLE_WITH_CUDA + OutT cast_coeff = static_cast(coeff_); + auto stream = static_cast(ctx_)->stream(); + memory::Copy(boost::get(place_), out_data, + platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_), + stream); + VLOG(10) << place_ << "RUN Scale loss grad op"; + +#endif + } + } +}; + void ScaleLossGradOpHandle::RunImpl() { // Doesn't wait any event std::string var_name = static_cast(this->outputs_[0])->name_; auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get(); - float *tmp = local_scope.FindVar(var_name) - ->GetMutable() - ->mutable_data(make_ddim({1}), place_); + auto *tensor = local_scope.FindVar(var_name)->GetMutable(); + tensor->Resize(make_ddim({1})); - if (platform::is_cpu_place(place_)) { - *tmp = coeff_; - } else { #ifdef PADDLE_WITH_CUDA - this->RunAndRecordEvent([&] { - auto stream = static_cast( - this->dev_ctxes_.at(place_)) - ->stream(); - memory::Copy(boost::get(place_), tmp, - platform::CPUPlace(), &coeff_, sizeof(float), stream); - VLOG(10) << place_ << "RUN Scale loss grad op"; - }); + ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, + this->dev_ctxes_.at(place_)); + this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); }); +#else + ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr); + framework::VisitDataType(out_dtype_, func); #endif - } } std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; } diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h index 523b55724c..8bedd1643e 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -26,8 +26,8 @@ namespace details { struct ScaleLossGradOpHandle : public OpHandleBase { ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope, - platform::Place place, - platform::DeviceContext *context); + platform::Place place, platform::DeviceContext *context, + proto::VarType::Type dtype); ~ScaleLossGradOpHandle() final; @@ -40,6 +40,7 @@ struct ScaleLossGradOpHandle : public OpHandleBase { float coeff_; Scope *scope_; platform::Place place_; + proto::VarType::Type out_dtype_; }; } // namespace details diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 2fe1c94ec0..0e7b0cbeb9 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -643,7 +643,7 @@ void OpDesc::CheckAttrs() { // not by users. return; } - checker->Check(attrs_); + checker->Check(&attrs_); } void OpDesc::InferShape(const BlockDesc &block) const { diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index ca31303f77..2311614c33 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -82,6 +82,10 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, AddAttr(OpNamescopeAttrName(), "Operator name with namesope.") .SetDefault(""); + AddAttr>(OpCreationCallstackAttrName(), + "Callstack for Op Creatation.") + .SetDefault({}); + Validate(); } diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 4c59c73d87..0a0f8f4655 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -47,6 +47,7 @@ class OpProtoAndCheckerMaker { static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleVarAttrName() { return "op_role_var"; } static const char *OpNamescopeAttrName() { return "op_namescope"; } + static const char *OpCreationCallstackAttrName() { return "op_callstack"; } void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc index bfc411ca2c..346d14d408 100644 --- a/paddle/fluid/framework/op_registry.cc +++ b/paddle/fluid/framework/op_registry.cc @@ -24,7 +24,7 @@ std::unique_ptr OpRegistry::CreateOp( const VariableNameMap& outputs, AttributeMap attrs) { auto& info = OpInfoMap::Instance().Get(type); if (info.Checker() != nullptr) { - info.Checker()->Check(attrs); + info.Checker()->Check(&attrs); } auto op = info.Creator()(type, inputs, outputs, attrs); return std::unique_ptr(op); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index fec311e3ee..ac2828136b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -16,10 +16,15 @@ limitations under the License. */ #include #include - +#include +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/transfer_scope_cache.h" @@ -157,27 +162,59 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - VLOG(4) << place << " " << DebugStringEx(&scope); - if (platform::is_gpu_place(place)) { + try { + if (VLOG_IS_ON(4)) { + VLOG(4) << place << " " << DebugStringEx(&scope); + } + if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("Cannot run operator on place %s", place); + PADDLE_THROW("Cannot run operator on place %s", place); #else - auto dev_id = boost::get(place).device; - platform::SetDeviceId(dev_id); + auto dev_id = boost::get(place).device; + platform::SetDeviceId(dev_id); #endif - } + } - // The profile has a process-wide mutex, results in serious performance issue - // in concurrency scenerio. Here use an `if` to fix this issue. - // Please not remove the `if`, ask @Superjomn if there are any concern. - if (platform::IsProfileEnabled()) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - RunImpl(scope, place); - } else { - RunImpl(scope, place); + // The profile has a process-wide mutex, results in serious performance + // issue + // in concurrency scenerio. Here use an `if` to fix this issue. + // Please not remove the `if`, ask @Superjomn if there are any concern. + if (platform::IsProfileEnabled()) { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + } else { + RunImpl(scope, place); + } + + if (VLOG_IS_ON(3)) { + VLOG(3) << place << " " << DebugStringEx(&scope); + } + } catch (platform::EnforceNotMet exception) { + if (Attrs().count("sub_block") != 0) { + throw exception; + } + + auto& callstack = Attr>( + OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + + if (callstack.empty()) { + throw exception; + } + std::ostringstream sout; + sout << "Invoke operator " << Type() << " error.\n"; + sout << "Python Callstacks: \n"; + for (auto& line : callstack) { + sout << line; + } + sout << "C++ Callstacks: \n"; + sout << exception.err_str_; + exception.err_str_ = sout.str(); + throw exception; + } catch (...) { + std::rethrow_exception(std::current_exception()); } - VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { @@ -1061,8 +1098,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { - PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s", - ipt_name, DebugString()); + PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized", + ipt_name); int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || data_type == -1, diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 5b09cad06c..ef096c2b81 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -28,8 +28,7 @@ void Tensor::check_memory_size() const { "or maybe the required data-type mismatches the data already stored."); } -Tensor::Tensor(std::type_index type) - : type_(framework::ToDataType(type)), offset_(0) {} +Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {} size_t Tensor::memory_size() const { return holder_ == nullptr ? 0UL : holder_->size() - offset_; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 2e110133a3..40606d9b06 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -69,7 +69,7 @@ class Tensor { public: Tensor() : type_(proto::VarType::FP32), offset_(0) {} - explicit Tensor(std::type_index type); + explicit Tensor(const proto::VarType::Type&); /*! Return a pointer to mutable memory block. */ template diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index cab6d9b67e..871c7bd2a7 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/temporary_allocator.h" namespace paddle { namespace framework { @@ -151,5 +152,26 @@ void TensorToVector(const Tensor& src, std::vector* dst) { src_ptr, size); } +template +paddle::framework::Tensor GetTensor( + memory::allocation::AllocationPtr temp_allocation_ptr, + const framework::DDim& dim) { + auto& deleter = temp_allocation_ptr.get_deleter(); + auto* allocation_ptr = temp_allocation_ptr.release(); + auto shared_allocation = + std::shared_ptr(allocation_ptr, deleter); + + PADDLE_ENFORCE( + dynamic_cast(allocation_ptr) != nullptr, + "The AllocationPtr must be TemporaryAllocation."); + PADDLE_ENFORCE_EQ(allocation_ptr->size(), + framework::product(dim) * sizeof(T)); + + paddle::framework::Tensor temp_tensor( + framework::ToDataType(std::type_index(typeid(T)))); + temp_tensor.Resize(dim); + temp_tensor.ResetHolder(std::move(shared_allocation)); + return temp_tensor; +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 4a7b31c7d4..2519f5e7ac 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -18,11 +18,11 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/fluid/platform/create_tensor_with_allocationptr.h" namespace paddle { namespace operators { @@ -161,10 +161,7 @@ class GemmConvKernel : public framework::OpKernel { auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( framework::product(col_shape) * sizeof(T)); - Tensor tep_tensor = - platform::GetTensor(std::move(tmp_allocation_ptr), col_shape); - - col.ShareDataWith(tep_tensor); + col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } @@ -299,10 +296,7 @@ class GemmConvGradKernel : public framework::OpKernel { auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( framework::product(col_shape) * sizeof(T)); - Tensor tep_tensor = - platform::GetTensor(std::move(tmp_allocation_ptr), col_shape); - - col.ShareDataWith(tep_tensor); + col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 1a149298fd..ae669f5525 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( elementwise_div, ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + elementwise_mul, ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc index 9538177460..e80a703c30 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fill_zeros_like_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( @@ -22,4 +23,6 @@ REGISTER_OP_CUDA_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index b10a19b658..e925e7bb59 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -131,8 +131,9 @@ class ConcatFunctor { int in_col = input[0].numel() / in_row; int out_row = in_row, out_col = 0; - std::vector inputs_data(in_num); + std::vector inputs_data; std::vector inputs_col(in_num + 1); + inputs_data.reserve(in_num); inputs_col[0] = 0; bool sameShape = true; @@ -143,7 +144,7 @@ class ConcatFunctor { } out_col += t_cols; inputs_col[i + 1] = out_col; - inputs_data[i] = const_cast(input[i].data()); + inputs_data.emplace_back(input[i].data()); } // computation diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu index b255d2a7c4..4682940f7e 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cu +++ b/paddle/fluid/operators/metrics/accuracy_op.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/metrics/accuracy_op.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/gpu_info.h" namespace paddle { @@ -94,6 +95,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { // FIXME(typhoonzero): types of T is for inference data. // label data is always int64 -REGISTER_OP_CUDA_KERNEL(accuracy, - paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + accuracy, paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/momentum_op.cu b/paddle/fluid/operators/optimizers/momentum_op.cu index 8ce739de8d..7f9e724640 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.cu +++ b/paddle/fluid/operators/optimizers/momentum_op.cu @@ -14,8 +14,11 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/optimizers/momentum_op.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( momentum, ops::MomentumOpKernel, - ops::MomentumOpKernel); + ops::MomentumOpKernel, + ops::MomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 71f079e4d9..f6ef83c3ba 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -237,7 +237,8 @@ class SparseMomentumFunctor { inline HOSTDEVICE void operator()(size_t i) { auto row_idx = math::BinarySearch(rows_, row_height_, i / row_numel_); - T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0; + T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] + : static_cast(0); // put memory access in register const T p = p_[i]; const T lr = lr_[0]; @@ -282,7 +283,8 @@ class SparseMomentumFunctor { inline HOSTDEVICE void operator()(size_t i) { auto row_idx = math::BinarySearch(rows_, row_height_, i / row_numel_); - T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0; + T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] + : static_cast(0); // put memory access in register const T p = p_[i]; const T lr = lr_[0]; diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h index 8fceed3558..57d6f4b3ea 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h @@ -52,7 +52,7 @@ class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker { "The maximum length of the sequence. If maxlen < 0, maxlen " "= max(Input(X)).") .SetDefault(-1) - .AddCustomChecker([](int &v) { + .AddCustomChecker([](const int &v) { PADDLE_ENFORCE(v < 0 || v >= 1, "Attr(maxlen) must be less than 0 or larger than 1"); }); diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index c17d1afc30..9e77f7252d 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -21,7 +21,7 @@ class TopkOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of TopkOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -44,12 +44,25 @@ class TopkOp : public framework::OperatorWithKernel { ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Indices"); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context(), layout_, library_); + } }; class TopkOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "(Tensor) The input of Topk op"); + AddInput("K", + "(Tensor) Number of top elements to look for along " + "the last dimension (along each row for matrices).") + .AsDispensable(); AddOutput("Out", "(Tensor) The output tensor of Topk op"); AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); AddComment(R"DOC( diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 0cad224ca8..c27039dd0a 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -150,7 +151,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, if (k < MaxLength - (*beam)) { topk[k] = topk[k + *beam]; } else { - topk[k].set(-INFINITY, -1); + topk[k].set(-static_cast(INFINITY), -1); } } if (!(*is_empty)) { @@ -160,7 +161,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, } *max = topk[MaxLength - 1]; - if ((*max).v == -1) *is_empty = true; + if ((*max).v == -static_cast(1)) *is_empty = true; *beam = 0; } } @@ -181,7 +182,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, if (k < MaxLength - *beam) { topk[k] = topk[k + *beam]; } else { - topk[k].set(-INFINITY, -1); + topk[k].set(-static_cast(INFINITY), -1); } } if (!(*is_empty)) { @@ -278,7 +279,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, bool firststep = true; for (int j = 0; j < MaxLength; j++) { - topk[j].set(-INFINITY, -1); + topk[j].set(-static_cast(INFINITY), -1); } while (top_num) { ThreadGetTopK( @@ -326,6 +327,17 @@ class TopkOpCUDAKernel : public framework::OpKernel { auto* indices = ctx.Output("Indices"); size_t k = static_cast(ctx.Attr("k")); + auto* k_t = ctx.Input("K"); + if (k_t) { + Tensor k_host; + framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); + k = k_host.data()[0]; + framework::DDim output_dims = output->dims(); + output_dims[output_dims.size() - 1] = k; + output->Resize(output_dims); + indices->Resize(output_dims); + } + const T* input_data = input->data(); T* output_data = output->mutable_data(ctx.GetPlace()); // FIXME(typhoonzero): data is always converted to type T? @@ -362,5 +374,7 @@ class TopkOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + top_k, paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel); diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index 76ece57b39..f7bac67300 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -37,8 +37,16 @@ class TopkKernel : public framework::OpKernel { auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); auto* indices = ctx.Output("Indices"); - // k is determined by Attr - const size_t k = static_cast(ctx.Attr("k")); + + size_t k = static_cast(ctx.Attr("k")); + auto* k_t = ctx.Input("K"); + if (k_t) { + k = k_t->data()[0]; + framework::DDim output_dims = output->dims(); + output_dims[output_dims.size() - 1] = k; + output->Resize(output_dims); + indices->Resize(output_dims); + } T* output_data = output->mutable_data(ctx.GetPlace()); int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/platform/create_tensor_with_allocationptr.h b/paddle/fluid/platform/create_tensor_with_allocationptr.h deleted file mode 100644 index 00fcc5f862..0000000000 --- a/paddle/fluid/platform/create_tensor_with_allocationptr.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/temporary_allocator.h" -namespace paddle { -namespace platform { - -template -paddle::framework::Tensor GetTensor( - memory::allocation::AllocationPtr temp_allocation_ptr, - const framework::DDim &dim) { - auto &deleter = temp_allocation_ptr.get_deleter(); - auto *allocation_ptr = temp_allocation_ptr.release(); - auto shared_allocation = - std::shared_ptr(allocation_ptr, deleter); - - PADDLE_ENFORCE(dynamic_cast(allocation_ptr) != nullptr, - "The AllocationPtr must be TemporaryAllocation."); - PADDLE_ENFORCE_EQ(allocation_ptr->size(), - framework::product(dim) * sizeof(T)); - - paddle::framework::Tensor temp_tensor(std::type_index(typeid(T))); - temp_tensor.Resize(dim); - temp_tensor.ResetHolder(std::move(shared_allocation)); - return temp_tensor; -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 81c443d758..022afb686b 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -256,10 +256,11 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device << ", CUDA Capability: " << compute_capability_ - << ", Driver Version: " << driver_version_ / 1000 + << ", Driver API Version: " << driver_version_ / 1000 << "." << (driver_version_ % 100) / 10 - << ", Runtime Version: " << runtime_version_ / 1000 - << "." << (runtime_version_ % 100) / 10; + << ", Runtime API Version: " + << runtime_version_ / 1000 << "." + << (runtime_version_ % 100) / 10; size_t cudnn_dso_ver = dynload::cudnnGetVersion(); LOG_FIRST_N(WARNING, 1) << "device: " << place_.device << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index af9744dcb8..7e87580189 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -41,7 +41,28 @@ limitations under the License. */ namespace paddle { namespace platform { -/*! \brief device temporary allocator singleton */ +/*! \brief device temporary allocator singleton. + * + * Some operator needs temporary memory during computation, for example, + * conv_gemm, which needs use col to store the result of im2col. If we + * create a stack memory which is used by CUDA Kernel, before the + * Computation(...) returns, we should add ctx->Wait(), because the + * execution of CUDA is async, if there doesn't have ctx->Wait(), + * the temporary memory will be released before the CUDA Kernel uses + * it. + * + * DeviceTemporaryAllocator is a singleton, which contains a + * `TemporaryAllocator` for each . And the TemporaryAllocator + * contains a temp_allocation_queue which is used to store the temporary + * allocations. The allocation, which is allocated by TemporaryAllocator, + * is a unique_ptr, and when it is not held by any variable, it will be + * pushed into the temp_allocation_queue. There are two opportunities to free + * the allocations of temp_allocation_queue: + * - when the Stream calls cudaStreamSynchronize; + * - when the allocation size of opportunities exceeds a certain threshold + * (defined by FLAGS_limit_of_temporary_allocation). + * + * */ class DeviceTemporaryAllocator { public: static DeviceTemporaryAllocator& Instance() { diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index e353658c7f..8df8e32098 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" #define NCCL_ID_VARNAME "NCCLID" @@ -38,6 +39,8 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { return ncclInt; } else if (type == framework::proto::VarType::INT64) { return ncclInt64; + } else if (type == framework::proto::VarType::FP16) { + return ncclFloat16; } else { PADDLE_THROW("Not supported"); } diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h index 4e32d2d695..812c4a3331 100644 --- a/paddle/fluid/platform/temporary_allocator.h +++ b/paddle/fluid/platform/temporary_allocator.h @@ -29,6 +29,19 @@ class TemporaryAllocation : public memory::allocation::Allocation { memory::allocation::AllocationPtr underlying_allocation_; }; +/*! \brief the TemporaryAllocator is used to alloc the temporary allocation + * which used by CUDA's async operation. + * + * The TemporaryAllocator contains a temp_allocation_queue which + * is used to store the temporary allocations. The allocation, which is + * allocated by TemporaryAllocator, is a unique_ptr, and when it is not held + * by any variable, it will be pushed into the temp_allocation_queue. + * + * There is one opportunity to free the allocations of temp_allocation_queue: + * - when the allocation size of opportunities exceeds a certain threshold + * (defined by FLAGS_limit_of_temporary_allocation). + * + * */ class TemporaryAllocator : public memory::allocation::Allocator { public: explicit TemporaryAllocator(platform::Place place); diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc index 3b940b0e82..e4e5be5b89 100644 --- a/paddle/fluid/platform/temporary_allocator_test.cc +++ b/paddle/fluid/platform/temporary_allocator_test.cc @@ -14,8 +14,7 @@ #include "paddle/fluid/platform/temporary_allocator.h" #include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/create_tensor_with_allocationptr.h" +#include "paddle/fluid/framework/tensor_util.h" DECLARE_double(limit_of_temporary_allocation); namespace paddle { @@ -47,6 +46,7 @@ TEST(temporary_allocator, temporary_allocator) { TEST(temporary_allocator, add_callback) { #ifdef PADDLE_WITH_CUDA + const double limit = FLAGS_limit_of_temporary_allocation; FLAGS_limit_of_temporary_allocation = 10; platform::CUDAPlace gpu_place(0); TemporaryAllocator gpu_alloc(gpu_place); @@ -63,7 +63,7 @@ TEST(temporary_allocator, add_callback) { }); { gpu_alloc.Allocate(100); } PADDLE_ENFORCE(deleted); - FLAGS_limit_of_temporary_allocation = -1; + FLAGS_limit_of_temporary_allocation = limit; #endif } @@ -75,8 +75,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) { auto allocation = cpu_alloc.Allocate(memory_size); void* address = allocation->ptr(); int numel = memory_size / sizeof(float); - framework::Tensor tensor = - GetTensor(std::move(allocation), framework::make_ddim({numel})); + framework::Tensor tensor = framework::GetTensor( + std::move(allocation), framework::make_ddim({numel})); PADDLE_ENFORCE_EQ(address, tensor.data()); PADDLE_ENFORCE_EQ(tensor.numel(), numel); } @@ -90,8 +90,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) { auto allocation = gpu_alloc.Allocate(memory_size); void* address = allocation->ptr(); int numel = memory_size / sizeof(float); - framework::Tensor tensor = - GetTensor(std::move(allocation), framework::make_ddim({numel})); + framework::Tensor tensor = framework::GetTensor( + std::move(allocation), framework::make_ddim({numel})); PADDLE_ENFORCE_EQ(address, tensor.data()); PADDLE_ENFORCE_EQ(tensor.numel(), numel); } @@ -116,7 +116,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) { { auto allocation = cpu_alloc.Allocate(memory_size); address = allocation->ptr(); - framework::Tensor tensor = GetTensor( + framework::Tensor tensor = framework::GetTensor( std::move(allocation), framework::make_ddim({numel})); PADDLE_ENFORCE_EQ(address, tensor.data()); PADDLE_ENFORCE_EQ(tensor.numel(), numel); @@ -138,7 +138,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) { { auto allocation = gpu_alloc.Allocate(memory_size); address = allocation->ptr(); - framework::Tensor tensor = GetTensor( + framework::Tensor tensor = framework::GetTensor( std::move(allocation), framework::make_ddim({numel})); PADDLE_ENFORCE_EQ(address, tensor.data()); PADDLE_ENFORCE_EQ(tensor.numel(), numel); diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index 06d8b65fb1..f8ded9f94e 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -49,6 +49,9 @@ void BindConstValue(pybind11::module* m) { op_proto_and_checker_maker.def( "kOpNameScopeAttrName", framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); + op_proto_and_checker_maker.def( + "kOpCreationCallstackAttrName", + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName); } } // namespace pybind diff --git a/paddle/scripts/installation_validate.py b/paddle/scripts/installation_validate.py new file mode 100644 index 0000000000..f84e2f4b17 --- /dev/null +++ b/paddle/scripts/installation_validate.py @@ -0,0 +1,18 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import paddle as pd + +print(pd.__version__) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2e6b40148d..418dc13468 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -79,6 +79,7 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7 -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7 -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/lib/libpython2.7.dylib" + pip install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 fi @@ -91,6 +92,7 @@ function cmake_gen() { -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/ -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib" WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + pip3.5 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 fi @@ -103,6 +105,7 @@ function cmake_gen() { -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/ -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib" WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 fi @@ -115,6 +118,7 @@ function cmake_gen() { -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/ -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib" WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 fi @@ -441,7 +445,9 @@ EOF # make install should also be test when unittest make install -j 8 if [ "$1" == "cp27-cp27m" ]; then + set -e pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl + python ${PADDLE_ROOT}/paddle/scripts/installation_validate.py elif [ "$1" == "cp35-cp35m" ]; then pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl elif [ "$1" == "cp36-cp36m" ]; then diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 13d2893fd1..af02721eb7 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -44,6 +44,8 @@ class DataToLoDTensorConverter(object): self.dtype = 'int64' elif dtype == core.VarDesc.VarType.FP64: self.dtype = 'float64' + elif dtype == core.VarDesc.VarType.FP16: + self.dtype = 'float16' elif dtype == core.VarDesc.VarType.INT32: self.dtype = 'int32' elif dtype == core.VarDesc.VarType.UINT8: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index de30ed2fc5..3427fb0c4a 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -20,6 +20,7 @@ import os import re import six import sys +import traceback import numpy as np @@ -604,6 +605,10 @@ class Operator(object): if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: del op_attrs[role_var_name] + callstack_var_name = op_maker.kOpCreationCallstackAttrName() + op_attrs[callstack_var_name] = list( + reversed(traceback.format_stack()))[1:] + if len(self.desc.type()) != 0: return if type is None: diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index b37ebbe517..26d1f8f4d2 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -18,6 +18,7 @@ from . import framework import numpy as np import contextlib from .core import VarDesc +from . import unique_name __all__ = [ 'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear', @@ -207,16 +208,39 @@ class UniformInitializer(Initializer): # Initialization Ops should be prepended and not appended if self._seed == 0: self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join(['gaussian_random', 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + op = block._prepend_op( type="uniform_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ "shape": var.shape, - "dtype": int(var.dtype), + "dtype": out_dtype, "min": self._low, "max": self._high, "seed": self._seed }) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + var.op = op return op @@ -261,17 +285,39 @@ class NormalInitializer(Initializer): # Initialization Ops should be prepended and not appended if self._seed == 0: self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join(['gaussian_random', 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + op = block._prepend_op( type="gaussian_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ "shape": var.shape, - "dtype": int(var.dtype), + "dtype": out_dtype, "mean": self._mean, "std": self._std_dev, "seed": self._seed, "use_mkldnn": False }) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) var.op = op return op diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index bdfcc8c4e2..cc1fdbd285 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2801,6 +2801,10 @@ def batch_norm(input, helper = LayerHelper('batch_norm', **locals()) dtype = helper.input_dtype() + # use fp32 for bn parameter + if dtype == core.VarDesc.VarType.FP16: + dtype = core.VarDesc.VarType.FP32 + input_shape = input.shape if data_layout == 'NCHW': channel_num = input_shape[1] @@ -2835,7 +2839,7 @@ def batch_norm(input, trainable=False, do_model_average=do_model_average_for_mean_and_var), shape=param_shape, - dtype=input.dtype) + dtype=dtype) mean.stop_gradient = True variance = helper.create_parameter( @@ -2845,7 +2849,7 @@ def batch_norm(input, trainable=False, do_model_average=do_model_average_for_mean_and_var), shape=param_shape, - dtype=input.dtype) + dtype=dtype) variance.stop_gradient = True # create output @@ -4526,7 +4530,7 @@ def topk(input, k, name=None): Args: input(Variable): The input variable which can be a vector or Tensor with higher rank. - k(int): The number of top elements to look for along the last dimension + k(int | Variable): The number of top elements to look for along the last dimension of input. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -4549,12 +4553,18 @@ def topk(input, k, name=None): helper = LayerHelper("top_k", **locals()) values = helper.create_variable_for_type_inference(dtype=input.dtype) indices = helper.create_variable_for_type_inference(dtype="int64") + inputs = {"X": [input]} + attrs = None + if isinstance(k, Variable): + inputs['K'] = k + else: + attrs = {'k': k} helper.append_op( type="top_k", - inputs={"X": [input]}, + inputs=inputs, outputs={"Out": [values], "Indices": [indices]}, - attrs={"k": k}) + attrs=attrs) values.stop_gradient = True indices.stop_gradient = True return values, indices @@ -7943,7 +7953,7 @@ def unstack(x, axis=0, num=None): num = x.shape[axis] outs = [] - for _ in num: + for _ in range(num): outs.append(helper.create_variable_for_type_inference(x.dtype)) helper.append_op( diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 76a707efdc..0fe836683b 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -368,6 +368,8 @@ class OpTest(unittest.TestCase): place = core.CUDAPlace(0) if core.is_float16_supported(place): return [place] + else: + return [] else: return [] places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py index 1b2b53f2d4..5257b0be6f 100644 --- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py +++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py @@ -22,8 +22,10 @@ from op_test import OpTest class TestAccuracyOp(OpTest): def setUp(self): self.op_type = "accuracy" + self.dtype = np.float32 + self.init_dtype() n = 8192 - infer = np.random.random((n, 1)).astype("float32") + infer = np.random.random((n, 1)).astype(self.dtype) indices = np.random.randint(0, 2, (n, 1)) label = np.random.randint(0, 2, (n, 1)) self.inputs = {'Out': infer, 'Indices': indices, "Label": label} @@ -34,14 +36,25 @@ class TestAccuracyOp(OpTest): num_correct += 1 break self.outputs = { - 'Accuracy': np.array([num_correct / float(n)]).astype("float32"), + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), 'Correct': np.array([num_correct]).astype("int32"), 'Total': np.array([n]).astype("int32") } + def init_dtype(self): + pass + def test_check_output(self): self.check_output() +class TestAccuracyOpFp16(TestAccuracyOp): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-3) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index cadaf1df53..15d4db590e 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -21,14 +21,16 @@ from op_test import OpTest class ElementwiseDivOp(OpTest): def setUp(self): self.op_type = "elementwise_div" + self.dtype = np.float32 + self.init_dtype() """ Warning CPU gradient check error! 'X': np.random.random((32,84)).astype("float32"), 'Y': np.random.random((32,84)).astype("float32") """ self.inputs = { - 'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), - 'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") + 'X': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype), + 'Y': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) } self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} @@ -46,6 +48,9 @@ class ElementwiseDivOp(OpTest): self.check_grad( ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y')) + def init_dtype(self): + pass + class TestElementwiseDivOp_scalar(ElementwiseDivOp): def setUp(self): @@ -126,5 +131,21 @@ class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp): } +class TestElementwiseDivOpFp16(ElementwiseDivOp): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=1) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=1, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=1, no_grad_set=set('Y')) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 57ba34f833..0484099188 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -135,5 +135,10 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): } +class TestElementwiseMulOpFp16(ElementwiseMulOp): + def init_dtype(self): + self.dtype = np.float16 + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py index eec73d0beb..20f1a110c3 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py @@ -22,12 +22,22 @@ from op_test import OpTest class TestFillZerosLikeOp(OpTest): def setUp(self): self.op_type = "fill_zeros_like" - self.inputs = {'X': np.random.random((219, 232)).astype("float32")} + self.dtype = np.float32 + self.init_dtype() + self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)} self.outputs = {'Out': np.zeros_like(self.inputs["X"])} + def init_dtype(self): + pass + def test_check_output(self): self.check_output() +class TestFillZerosLikeOpFp16(TestFillZerosLikeOp): + def init_dtype(self): + self.dtype = np.float16 + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index cf4346cf2e..77ec6f9b6b 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -24,11 +24,13 @@ from op_test import OpTest class TestMomentumOp1(OpTest): def setUp(self): self.op_type = "momentum" + self.dtype = np.float32 + self.init_dtype() - param = np.random.random((123, 321)).astype("float32") - grad = np.random.random((123, 321)).astype("float32") - velocity = np.zeros((123, 321)).astype("float32") - learning_rate = np.array([0.001]).astype("float32") + param = np.random.random((123, 321)).astype(self.dtype) + grad = np.random.random((123, 321)).astype(self.dtype) + velocity = np.zeros((123, 321)).astype(self.dtype) + learning_rate = np.array([0.001]).astype(self.dtype) mu = 0.0001 use_nesterov = False @@ -50,10 +52,21 @@ class TestMomentumOp1(OpTest): self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} + def init_dtype(self): + pass + def test_check_output(self): self.check_output() +class TestMomentumOpFp16(TestMomentumOp1): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-3) + + class TestMomentumOp2(OpTest): '''Test Momentum with default values for attributes ''' diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index 4153394c1d..37b9a9188a 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -69,7 +69,7 @@ class TestOperator(unittest.TestCase): set(mul_op.attr_names), set([ "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", - "op_namescope" + "op_namescope", "op_callstack" ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py index 69b29db83a..9fbf59ed66 100644 --- a/python/paddle/fluid/tests/unittests/test_top_k_op.py +++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py @@ -21,15 +21,22 @@ from op_test import OpTest class TestTopkOp(OpTest): def setUp(self): + self.variable_k = False self.set_args() self.op_type = "top_k" + self.dtype = np.float32 + self.init_dtype() + k = self.top_k - input = np.random.random((self.row, k)).astype("float32") + input = np.random.random((self.row, k)).astype(self.dtype) output = np.ndarray((self.row, k)) indices = np.ndarray((self.row, k)).astype("int64") - self.inputs = {'X': input} - self.attrs = {'k': k} + + if self.variable_k: + self.inputs['K'] = np.array([k]).astype("int32") + else: + self.attrs = {'k': k} for rowid in range(self.row): row = input[rowid] @@ -38,6 +45,9 @@ class TestTopkOp(OpTest): self.outputs = {'Out': output, 'Indices': indices} + def init_dtype(self): + pass + def set_args(self): self.row = 32 self.top_k = 1 @@ -46,6 +56,11 @@ class TestTopkOp(OpTest): self.check_output() +class TestTopkOpFp16(TestTopkOp): + def init_dtype(self): + self.dtype = np.float16 + + class TestTopkOp3d(OpTest): def setUp(self): self.op_type = "top_k" @@ -107,5 +122,12 @@ class TestTopkOp4(TestTopkOp): self.top_k = 1 +class TestTopkOp5(TestTopkOp): + def set_args(self): + self.row = 40000 + self.top_k = 3 + self.variable_k = True + + if __name__ == "__main__": unittest.main()