Merge branch 'develop' of github.com:PaddlePaddle/Paddle into parallel_graph_mode

revert-15207-remove_op_handle_lock_and_fix_var
Yancey1989 7 years ago
commit 86bb583881

@ -48,10 +48,10 @@ if(WITH_GPU)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util)
else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
endif(WIN32)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)

@ -165,7 +165,7 @@ template <typename T>
class GreaterThanChecker {
public:
explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const {
void operator()(const T& value) const {
PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
}
@ -177,7 +177,7 @@ template <typename T>
class EqualGreaterThanChecker {
public:
explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const {
void operator()(const T& value) const {
PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
}
@ -193,7 +193,7 @@ class DefaultValueSetter {
public:
explicit DefaultValueSetter(T default_value)
: default_value_(default_value) {}
void operator()(T& value) const { value = default_value_; } // NOLINT
void operator()(T* value) const { *value = default_value_; }
private:
T default_value_;
@ -203,7 +203,7 @@ template <typename T>
class EnumInContainer {
public:
explicit EnumInContainer(const std::unordered_set<T>& c) : container_(c) {}
void operator()(T& val) const {
void operator()(const T& val) const {
PADDLE_ENFORCE(container_.find(val) != container_.end(),
"Value %s is not in enum container %s", val,
ContainerDebugString());
@ -232,7 +232,8 @@ class EnumInContainer {
// an attribute can have more than one limits
template <typename T>
class TypedAttrChecker {
typedef std::function<void(T&)> ValueChecker;
typedef std::function<void(T*)> DefaultValueChecker;
typedef std::function<void(const T&)> ValueChecker;
public:
explicit TypedAttrChecker(const std::string& attr_name)
@ -268,17 +269,17 @@ class TypedAttrChecker {
return *this;
}
void operator()(AttributeMap& attr_map) const { // NOLINT
if (!attr_map.count(attr_name_)) {
void operator()(AttributeMap* attr_map) const {
if (!attr_map->count(attr_name_)) {
// user do not set this attr
PADDLE_ENFORCE(!default_value_setter_.empty(),
"Attribute '%s' is required!", attr_name_);
// default_value_setter_ has no more than one element
T val;
(default_value_setter_[0])(val);
attr_map[attr_name_] = val;
(default_value_setter_[0])(&val);
(*attr_map)[attr_name_] = val;
}
Attribute& attr = attr_map.at(attr_name_);
Attribute& attr = attr_map->at(attr_name_);
ExtractAttribute<T> extract_attr(attr_name_);
T* attr_value = extract_attr(attr);
for (const auto& checker : value_checkers_) {
@ -289,12 +290,12 @@ class TypedAttrChecker {
private:
std::string attr_name_;
std::vector<ValueChecker> value_checkers_;
std::vector<ValueChecker> default_value_setter_;
std::vector<DefaultValueChecker> default_value_setter_;
};
// check whether op's all attributes fit their own limits
class OpAttrChecker {
typedef std::function<void(AttributeMap&)> AttrChecker;
typedef std::function<void(AttributeMap*)> AttrChecker;
public:
template <typename T>
@ -304,7 +305,7 @@ class OpAttrChecker {
return *(checker.target<TypedAttrChecker<T>>());
}
void Check(AttributeMap& attr_map) const { // NOLINT
void Check(AttributeMap* attr_map) const {
for (const auto& checker : attr_checkers_) {
checker(attr_map);
}

@ -355,7 +355,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
BuildStrategy::GradientScaleStrategy::kCustomized) {
// TODO(paddle-dev): Why is there no input for this op_handle?
auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0]);
auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType();
CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0],
out_dtype);
}
// This assumes the backward generating code will ensure IsScaleLossOp
// is true only for the op that scale the final scalar loss.
@ -658,14 +660,14 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
ir::Graph *result, const std::string &loss_grad_name,
ir::Node *out_var_node) const {
ir::Node *out_var_node, proto::VarType::Type dtype) const {
size_t num_parallel_devices = Get<size_t>("num_parallel_devices");
for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]);
auto *op_handle = new ScaleLossGradOpHandle(
result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation),
num_parallel_devices, local_scopes_[i], places_[i], dev_ctx);
num_parallel_devices, local_scopes_[i], places_[i], dev_ctx, dtype);
result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only use device_count as scale

@ -68,7 +68,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
void CreateScaleLossGradOp(ir::Graph *result,
const std::string &loss_grad_name,
ir::Node *out_var_node) const;
ir::Node *out_var_node,
proto::VarType::Type dtype) const;
VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
int dst_dev_id) const;

@ -22,39 +22,66 @@ namespace details {
ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
Scope *scope,
platform::Place place,
platform::DeviceContext *dev_ctx)
platform::DeviceContext *dev_ctx,
proto::VarType::Type dtype)
: OpHandleBase(node),
coeff_(static_cast<float>(1.0 / num_dev)),
scope_(scope),
place_(place) {
place_(place),
out_dtype_(dtype) {
this->SetDeviceContext(place_, dev_ctx);
}
ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
struct ScaleLossGradFunctor {
float coeff_;
Tensor *out_;
platform::Place place_;
OpHandleBase *op_handle_;
proto::VarType::Type out_dtype_;
platform::DeviceContext *ctx_;
ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place,
OpHandleBase *op_handle, proto::VarType::Type dtype,
platform::DeviceContext *ctx)
: coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {}
template <typename OutT>
void apply() const {
auto *out_data = out_->mutable_data<OutT>(place_);
if (platform::is_cpu_place(place_)) {
*out_data = static_cast<OutT>(coeff_);
} else {
#ifdef PADDLE_WITH_CUDA
OutT cast_coeff = static_cast<OutT>(coeff_);
auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), out_data,
platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_),
stream);
VLOG(10) << place_ << "RUN Scale loss grad op";
#endif
}
}
};
void ScaleLossGradOpHandle::RunImpl() {
// Doesn't wait any event
std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
float *tmp = local_scope.FindVar(var_name)
->GetMutable<LoDTensor>()
->mutable_data<float>(make_ddim({1}), place_);
auto *tensor = local_scope.FindVar(var_name)->GetMutable<LoDTensor>();
tensor->Resize(make_ddim({1}));
if (platform::is_cpu_place(place_)) {
*tmp = coeff_;
} else {
#ifdef PADDLE_WITH_CUDA
this->RunAndRecordEvent([&] {
auto stream = static_cast<platform::CUDADeviceContext *>(
this->dev_ctxes_.at(place_))
->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream);
VLOG(10) << place_ << "RUN Scale loss grad op";
});
ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_,
this->dev_ctxes_.at(place_));
this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
#else
ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr);
framework::VisitDataType(out_dtype_, func);
#endif
}
}
std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; }

@ -26,8 +26,8 @@ namespace details {
struct ScaleLossGradOpHandle : public OpHandleBase {
ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope,
platform::Place place,
platform::DeviceContext *context);
platform::Place place, platform::DeviceContext *context,
proto::VarType::Type dtype);
~ScaleLossGradOpHandle() final;
@ -40,6 +40,7 @@ struct ScaleLossGradOpHandle : public OpHandleBase {
float coeff_;
Scope *scope_;
platform::Place place_;
proto::VarType::Type out_dtype_;
};
} // namespace details

@ -643,7 +643,7 @@ void OpDesc::CheckAttrs() {
// not by users.
return;
}
checker->Check(attrs_);
checker->Check(&attrs_);
}
void OpDesc::InferShape(const BlockDesc &block) const {

@ -82,6 +82,10 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namesope.")
.SetDefault("");
AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
"Callstack for Op Creatation.")
.SetDefault({});
Validate();
}

@ -47,6 +47,7 @@ class OpProtoAndCheckerMaker {
static const char *OpRoleAttrName() { return "op_role"; }
static const char *OpRoleVarAttrName() { return "op_role_var"; }
static const char *OpNamescopeAttrName() { return "op_namescope"; }
static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);

@ -24,7 +24,7 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
const VariableNameMap& outputs, AttributeMap attrs) {
auto& info = OpInfoMap::Instance().Get(type);
if (info.Checker() != nullptr) {
info.Checker()->Check(attrs);
info.Checker()->Check(&attrs);
}
auto op = info.Creator()(type, inputs, outputs, attrs);
return std::unique_ptr<OperatorBase>(op);

@ -16,10 +16,15 @@ limitations under the License. */
#include <glog/logging.h>
#include <algorithm>
#include <sstream>
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
@ -157,27 +162,59 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames,
}
void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(4) << place << " " << DebugStringEx(&scope);
if (platform::is_gpu_place(place)) {
try {
if (VLOG_IS_ON(4)) {
VLOG(4) << place << " " << DebugStringEx(&scope);
}
if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot run operator on place %s", place);
PADDLE_THROW("Cannot run operator on place %s", place);
#else
auto dev_id = boost::get<platform::CUDAPlace>(place).device;
platform::SetDeviceId(dev_id);
auto dev_id = boost::get<platform::CUDAPlace>(place).device;
platform::SetDeviceId(dev_id);
#endif
}
}
// The profile has a process-wide mutex, results in serious performance issue
// in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern.
if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
} else {
RunImpl(scope, place);
// The profile has a process-wide mutex, results in serious performance
// issue
// in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern.
if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
} else {
RunImpl(scope, place);
}
if (VLOG_IS_ON(3)) {
VLOG(3) << place << " " << DebugStringEx(&scope);
}
} catch (platform::EnforceNotMet exception) {
if (Attrs().count("sub_block") != 0) {
throw exception;
}
auto& callstack = Attr<std::vector<std::string>>(
OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (callstack.empty()) {
throw exception;
}
std::ostringstream sout;
sout << "Invoke operator " << Type() << " error.\n";
sout << "Python Callstacks: \n";
for (auto& line : callstack) {
sout << line;
}
sout << "C++ Callstacks: \n";
sout << exception.err_str_;
exception.err_str_ = sout.str();
throw exception;
} catch (...) {
std::rethrow_exception(std::current_exception());
}
VLOG(3) << place << " " << DebugStringEx(&scope);
}
bool OperatorBase::HasInputs(const std::string& name) const {
@ -1061,8 +1098,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
t = &(var->Get<SelectedRows>().value());
}
if (t != nullptr) {
PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s",
ipt_name, DebugString());
PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized",
ipt_name);
int tmp = static_cast<int>(t->type());
PADDLE_ENFORCE(
tmp == data_type || data_type == -1,

@ -28,8 +28,7 @@ void Tensor::check_memory_size() const {
"or maybe the required data-type mismatches the data already stored.");
}
Tensor::Tensor(std::type_index type)
: type_(framework::ToDataType(type)), offset_(0) {}
Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {}
size_t Tensor::memory_size() const {
return holder_ == nullptr ? 0UL : holder_->size() - offset_;

@ -69,7 +69,7 @@ class Tensor {
public:
Tensor() : type_(proto::VarType::FP32), offset_(0) {}
explicit Tensor(std::type_index type);
explicit Tensor(const proto::VarType::Type&);
/*! Return a pointer to mutable memory block. */
template <typename T>

@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/temporary_allocator.h"
namespace paddle {
namespace framework {
@ -151,5 +152,26 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
src_ptr, size);
}
template <typename T>
paddle::framework::Tensor GetTensor(
memory::allocation::AllocationPtr temp_allocation_ptr,
const framework::DDim& dim) {
auto& deleter = temp_allocation_ptr.get_deleter();
auto* allocation_ptr = temp_allocation_ptr.release();
auto shared_allocation =
std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter);
PADDLE_ENFORCE(
dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
"The AllocationPtr must be TemporaryAllocation.");
PADDLE_ENFORCE_EQ(allocation_ptr->size(),
framework::product(dim) * sizeof(T));
paddle::framework::Tensor temp_tensor(
framework::ToDataType(std::type_index(typeid(T))));
temp_tensor.Resize(dim);
temp_tensor.ResetHolder(std::move(shared_allocation));
return temp_tensor;
}
} // namespace framework
} // namespace paddle

@ -18,11 +18,11 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/fluid/platform/create_tensor_with_allocationptr.h"
namespace paddle {
namespace operators {
@ -161,10 +161,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
auto tmp_allocation_ptr =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
framework::product(col_shape) * sizeof(T));
Tensor tep_tensor =
platform::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
col.ShareDataWith(tep_tensor);
col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
@ -299,10 +296,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
auto tmp_allocation_ptr =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
framework::product(col_shape) * sizeof(T));
Tensor tep_tensor =
platform::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
col.ShareDataWith(tep_tensor);
col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}

@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
elementwise_div,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
elementwise_div_grad,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,

@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_mul,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
elementwise_mul, ops::ElementwiseMulKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseMulKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseMulKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseMulKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseMulKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
elementwise_mul_grad,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, plat::float16>);

@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fill_zeros_like_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
@ -22,4 +23,6 @@ REGISTER_OP_CUDA_KERNEL(
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, float>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);

@ -131,8 +131,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
int in_col = input[0].numel() / in_row;
int out_row = in_row, out_col = 0;
std::vector<T*> inputs_data(in_num);
std::vector<const T*> inputs_data;
std::vector<int> inputs_col(in_num + 1);
inputs_data.reserve(in_num);
inputs_col[0] = 0;
bool sameShape = true;
@ -143,7 +144,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
}
out_col += t_cols;
inputs_col[i + 1] = out_col;
inputs_data[i] = const_cast<T*>(input[i].data<T>());
inputs_data.emplace_back(input[i].data<T>());
}
// computation

@ -16,6 +16,7 @@ limitations under the License. */
#include <thrust/reduce.h>
#include "paddle/fluid/operators/metrics/accuracy_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
@ -94,6 +95,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
// FIXME(typhoonzero): types of T is for inference data.
// label data is always int64
REGISTER_OP_CUDA_KERNEL(accuracy,
paddle::operators::AccuracyOpCUDAKernel<float>,
paddle::operators::AccuracyOpCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(
accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
paddle::operators::AccuracyOpCUDAKernel<double>,
paddle::operators::AccuracyOpCUDAKernel<paddle::platform::float16>);

@ -14,8 +14,11 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/optimizers/momentum_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
momentum, ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, double>);
ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::MomentumOpKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>);

@ -237,7 +237,8 @@ class SparseMomentumFunctor<T, UseNesterov> {
inline HOSTDEVICE void operator()(size_t i) {
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0;
T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_]
: static_cast<T>(0);
// put memory access in register
const T p = p_[i];
const T lr = lr_[0];
@ -282,7 +283,8 @@ class SparseMomentumFunctor<T, NoNesterov> {
inline HOSTDEVICE void operator()(size_t i) {
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0;
T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_]
: static_cast<T>(0);
// put memory access in register
const T p = p_[i];
const T lr = lr_[0];

@ -52,7 +52,7 @@ class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker {
"The maximum length of the sequence. If maxlen < 0, maxlen "
"= max(Input(X)).")
.SetDefault(-1)
.AddCustomChecker([](int &v) {
.AddCustomChecker([](const int &v) {
PADDLE_ENFORCE(v < 0 || v >= 1,
"Attr(maxlen) must be less than 0 or larger than 1");
});

@ -21,7 +21,7 @@ class TopkOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of TopkOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
@ -44,12 +44,25 @@ class TopkOp : public framework::OperatorWithKernel {
ctx->ShareLoD("X", "Out");
ctx->ShareLoD("X", "Indices");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.device_context(), layout_, library_);
}
};
class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) The input of Topk op");
AddInput("K",
"(Tensor) Number of top elements to look for along "
"the last dimension (along each row for matrices).")
.AsDispensable();
AddOutput("Out", "(Tensor) The output tensor of Topk op");
AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
AddComment(R"DOC(

@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
@ -150,7 +151,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if (k < MaxLength - (*beam)) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(-INFINITY, -1);
topk[k].set(-static_cast<T>(INFINITY), -1);
}
}
if (!(*is_empty)) {
@ -160,7 +161,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
}
*max = topk[MaxLength - 1];
if ((*max).v == -1) *is_empty = true;
if ((*max).v == -static_cast<T>(1)) *is_empty = true;
*beam = 0;
}
}
@ -181,7 +182,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if (k < MaxLength - *beam) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(-INFINITY, -1);
topk[k].set(-static_cast<T>(INFINITY), -1);
}
}
if (!(*is_empty)) {
@ -278,7 +279,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
bool firststep = true;
for (int j = 0; j < MaxLength; j++) {
topk[j].set(-INFINITY, -1);
topk[j].set(-static_cast<T>(INFINITY), -1);
}
while (top_num) {
ThreadGetTopK<T, MaxLength, BlockSize>(
@ -326,6 +327,17 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
auto* indices = ctx.Output<Tensor>("Indices");
size_t k = static_cast<int>(ctx.Attr<int>("k"));
auto* k_t = ctx.Input<Tensor>("K");
if (k_t) {
Tensor k_host;
framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host);
k = k_host.data<int>()[0];
framework::DDim output_dims = output->dims();
output_dims[output_dims.size() - 1] = k;
output->Resize(output_dims);
indices->Resize(output_dims);
}
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
// FIXME(typhoonzero): data is always converted to type T?
@ -362,5 +374,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>,
paddle::operators::TopkOpCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(
top_k, paddle::operators::TopkOpCUDAKernel<float>,
paddle::operators::TopkOpCUDAKernel<double>,
paddle::operators::TopkOpCUDAKernel<paddle::platform::float16>);

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save