Merge remote-tracking branch 'ups/develop' into fea/jit/gen

fix_recordio_link
tensor-tang 6 years ago
commit f3badacd97

@ -68,7 +68,6 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d
option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(WITH_INFERENCE "Compile fluid inference library" ON)
option(ON_INFER "Turn on inference optimization." OFF)
option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF)
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)

@ -24,6 +24,7 @@ if(NOT WITH_FLUID_ONLY)
endif()
add_subdirectory(testing)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API)
add_subdirectory(fluid)
endif()

@ -64,7 +64,7 @@ paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', '
paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
@ -177,6 +177,8 @@ paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, k
paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)

@ -9,8 +9,6 @@ add_subdirectory(pybind)
add_subdirectory(recordio)
endif(NOT WIN32)
if(WITH_INFERENCE)
# NOTE: please add subdirectory inference at last.
add_subdirectory(inference)
add_subdirectory(train)
endif()
# NOTE: please add subdirectory inference at last.
add_subdirectory(inference)
add_subdirectory(train)

@ -34,7 +34,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
nccl_ctxs_(ctxs) {
if (nccl_ctxs_) {
for (auto &p : places_) {
this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
}
}
}
@ -46,7 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1;
@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() {
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i];
auto *var = scope.FindVar(out_var_handles[i]->name_);
auto *dev_ctx = dev_ctxes_[p];
auto *dev_ctx = dev_ctxes_.at(p);
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();

@ -44,7 +44,8 @@ struct BroadcastOpHandle : public OpHandleBase {
nccl_ctxs_(nccl_ctxs) {
if (nccl_ctxs_) {
for (auto &p_ctx : nccl_ctxs_->contexts_) {
dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
p_ctx.second.ctx_.get());
}
}
}

@ -37,7 +37,7 @@ void ComputationOpHandle::RunImpl() {
bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
bool need_wait =
in_var && in_var->GeneratedOp() &&
in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_];
in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_);
return need_wait;
}

@ -28,7 +28,7 @@ DataBalanceOpHandle::DataBalanceOpHandle(
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
if (ctxs) {
for (auto &p : places_) {
this->dev_ctxes_[p] = ctxs->DevCtx(p);
this->SetDeviceContext(p, ctxs->DevCtx(p));
}
}
}
@ -89,8 +89,8 @@ void DataBalanceOpHandle::RunImpl() {
PADDLE_ENFORCE_GT(places_.size(), 1,
"Data balance can only be enabled when the number of "
"places to run larger than 1.");
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),

@ -36,7 +36,7 @@ void GatherOpHandle::RunImpl() {
VarHandle *out_var_handle;
{
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
"The number of output should be one.");
out_var_handle = out_var_handles.front();
@ -99,7 +99,7 @@ void GatherOpHandle::RunImpl() {
Tensor *out_tensor = out_value->mutable_value();
// copy
auto dev_ctx = dev_ctxes_[out_var_handle->place_];
auto dev_ctx = dev_ctxes_.at(out_var_handle->place_);
RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx,
t_out_p] {
int s = 0, e = 0;

@ -103,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() {
void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
for (auto *in : inputs_) {
if (NeedWait(in)) {
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[place]);
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place));
}
}
}

@ -27,7 +27,7 @@ namespace framework {
namespace details {
void ReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
if (places_.size() == 1) return;
// the input and output may have dummy var.

@ -46,7 +46,8 @@ struct ReduceOpHandle : public OpHandleBase {
nccl_ctxs_(nccl_ctxs) {
if (nccl_ctxs_) {
for (auto &p_ctx : nccl_ctxs_->contexts_) {
dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
p_ctx.second.ctx_.get());
}
}
}

@ -38,7 +38,7 @@ void RPCOpHandle::RunImpl() {
continue;
}
if (in->GeneratedOp()) {
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[p]);
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(p));
}
}
auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();

@ -27,7 +27,7 @@ ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
coeff_(static_cast<float>(1.0 / num_dev)),
scope_(scope),
place_(place) {
dev_ctxes_[place_] = dev_ctx;
this->SetDeviceContext(place_, dev_ctx);
}
ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
@ -46,8 +46,8 @@ void ScaleLossGradOpHandle::RunImpl() {
} else {
#ifdef PADDLE_WITH_CUDA
this->RunAndRecordEvent([&] {
auto stream =
static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
auto stream = static_cast<platform::CUDADeviceContext *>(
this->dev_ctxes_.at(place_))
->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream);

@ -19,81 +19,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
// NOTE The vector<LoDTensor> can't be replaced with the class LoDTensorArray
// directly, because there are many vector<LoDTensor> used accross the project,
// and some of them are treated as LoDTensorArray.
#if !defined(PADDLE_ON_INFERENCE)
using LoDTensorArray = std::vector<LoDTensor>;
#else // !PADDLE_ON_INFERENCE
#pragma message "LoDTensorArray is replaced with the inference one."
/*
* A LoDTensorArray which will not deallocate buffer when resized, fix the data
* diff in inference, and more performance friendly in the concurrency
* scenerios.
*/
class LoDTensorArray {
public:
LoDTensorArray() = default;
using iterator = std::vector<LoDTensor>::iterator;
using const_iterator = std::vector<LoDTensor>::const_iterator;
const_iterator begin() const { return array_.begin(); }
const_iterator end() const { return array_.begin() + size_; }
iterator begin() { return array_.begin(); }
iterator end() { return array_.begin() + size_; }
void push_back(const LoDTensor& x) {
if (size_ < array_.size()) {
array_[size_++] = x;
} else {
array_.push_back(x);
++size_;
}
}
void resize(size_t size) {
if (array_.size() < size) {
array_.resize(size);
}
size_ = size;
}
void emplace_back() { array_.emplace_back(); }
void emplace_back(LoDTensor&& x) { array_.emplace_back(std::move(x)); }
LoDTensor& back() { return array_.back(); }
size_t space() const { return array_.size(); }
void reserve(size_t size) {
// Naive warning to tell user this array might be to large. The memory and
// buffer used by this TensorArray will not be deleted during the training
// and inference phase, so attention not to make it expand too long.
if (size > 800UL) {
LOG(WARNING) << "TensorArray has more than 800 items";
}
array_.reserve(size);
}
bool empty() const { return size_ == 0UL; }
void clear() { size_ = 0UL; }
LoDTensor& operator[](size_t id) { return array_[id]; }
const LoDTensor& operator[](size_t id) const { return array_[id]; }
LoDTensor& at(size_t id) { return array_.at(id); }
const LoDTensor& at(size_t id) const { return array_.at(id); }
size_t size() const { return size_; }
private:
size_t size_{0};
std::vector<LoDTensor> array_;
};
#endif // !PADDLE_ON_INFERENCE
} // namespace framework
} // namespace paddle

@ -358,7 +358,7 @@ static bool VarIsTensor(const Variable* var) {
return var->IsType<LoDTensor>() || var->IsType<SelectedRows>();
}
static const Tensor* GetTensorFromVar(Variable* var) {
const Tensor* GetTensorFromVar(Variable* var) {
if (var->IsType<LoDTensor>()) {
return var->GetMutable<LoDTensor>();
} else if (var->IsType<SelectedRows>()) {

@ -63,6 +63,7 @@ inline std::string GradVarName(const std::string& var_name) {
}
proto::VarType::Type GetDataTypeOfVar(const Variable* var);
const Tensor* GetTensorFromVar(Variable* var);
class OperatorBase;
class ExecutionContext;

@ -303,10 +303,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
}
ParallelExecutor::~ParallelExecutor() {
const auto dev_ctxs =
platform::DeviceContextPool::Instance().GetAllDeviceContexts();
for (auto &dev_ctx : dev_ctxs) {
dev_ctx->Wait();
for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
if (member_->own_local_scope_) {

@ -61,8 +61,6 @@ cc_test(test_paddle_inference_api
inference_api_test(test_api_impl SRC api_impl_tester.cc
ARGS test_word2vec test_image_classification)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
ARGS --dirname=${PYTHON_TESTS_DIR}/book)

@ -22,9 +22,9 @@ limitations under the License. */
#include "paddle/fluid/inference/tests/test_helper.h"
#ifdef __clang__
#define ACC_DIFF 4e-3
#define ACC_DIFF 4e-2
#else
#define ACC_DIFF 1e-3
#define ACC_DIFF 1e-2
#endif
DEFINE_string(dirname, "", "Directory of the inference model.");
@ -187,7 +187,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
std::vector<std::thread> threads;
for (int tid = 0; tid < num_jobs; ++tid) {
threads.emplace_back([&, tid]() {
auto predictor = main_predictor->Clone();
auto predictor = CreatePaddlePredictor(config);
auto& local_inputs = paddle_tensor_feeds[tid];
std::vector<PaddleTensor> local_outputs;
ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
@ -245,7 +245,7 @@ void MainThreadsImageClassification(bool use_gpu) {
std::vector<std::thread> threads;
for (int tid = 0; tid < num_jobs; ++tid) {
threads.emplace_back([&, tid]() {
auto predictor = main_predictor->Clone();
auto predictor = CreatePaddlePredictor(config);
auto& local_inputs = paddle_tensor_feeds[tid];
std::vector<PaddleTensor> local_outputs;
ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));

@ -70,8 +70,12 @@ void Main(bool use_gpu) {
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
i++) {
CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
0.001);
// Here will result random fail, for that the model is trained by CI, the
// train phase is not stable, so the result will be random.
// TODO(Superjomn) will restore after the model is upload.
// CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i],
// result[i],
// 0.001);
}
}
}

@ -0,0 +1,97 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/add_position_encoding_op.h"
namespace paddle {
namespace operators {
class AddPositionEncodingOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"X(Input) of add_position_encoding_op should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Out(Output) of add_position_encoding_op should not be null.");
auto x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim("Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
};
class AddPositionEncodingOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) must not be null.");
PADDLE_ENFORCE(ctx->HasInput("Out"), "Out must not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Out@GRAD must not be null.");
auto out_dims = ctx->GetInputDim("Out");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
}
}
};
class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of AddPositionEncoding operator");
AddOutput("Out", "Output of AddPositionEncoding operator");
AddAttr<float>("alpha", "The scale of Original Embedding.")
.SetDefault(1.0f)
.AddCustomChecker([](const float& alpha) {
PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must be above 0.0.");
});
AddAttr<float>("beta", "The scale of Position Embedding.")
.SetDefault(1.0f)
.AddCustomChecker([](const float& beta) {
PADDLE_ENFORCE(beta >= 0.0f, "'beta' must be between 0.0.");
});
AddComment(R"DOC(
Add Position Encoding Operator.
The add position encoding calculates the output based on the input, alpha, beta.
The size of each dimension of the parameters checked in the infer-shape.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plt = paddle::platform;
REGISTER_OPERATOR(add_position_encoding, ops::AddPositionEncodingOp,
ops::AddPositionEncodingOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad);
REGISTER_OP_CPU_KERNEL(
add_position_encoding,
ops::AddPositionEncodingKernel<plt::CPUDeviceContext, float>,
ops::AddPositionEncodingKernel<plt::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
add_position_encoding_grad,
ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, float>,
ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, double>);

@ -0,0 +1,105 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class AddPositionEncodingKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<framework::LoDTensor>("X");
auto& x_lod = X->lod();
auto* src_ptr = X->data<T>();
auto* Out = context.Output<framework::LoDTensor>("Out");
auto* dst_ptr = Out->mutable_data<T>(context.GetPlace());
float alpha = context.Attr<float>("alpha");
float beta = context.Attr<float>("beta");
auto x_dim = X->dims();
int batch_size = 0;
int max_seq_len = 0;
int enc_size = 0;
if (x_lod.empty()) {
PADDLE_ENFORCE(
x_dim.size() == 3UL,
"The input X of Add Position Encoding should be 3-D Tensor!");
batch_size = x_dim[0];
max_seq_len = x_dim[1];
enc_size = x_dim[2];
} else {
PADDLE_ENFORCE(
x_dim.size() == 2UL,
"The input X of Add Position Encoding should be 2-D LoDTensor!");
PADDLE_ENFORCE(
x_lod.size() == 1UL,
"The Add Position Encoding Op only supports lod_level == 1!");
batch_size = x_lod[0].size() - 1;
max_seq_len = -1;
enc_size = x_dim[1];
}
PADDLE_ENFORCE(enc_size % 2 == 0, "Only support even encode size!");
const int half_size = enc_size / 2;
for (int i = 0; i < batch_size; ++i) {
const int max_length =
x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
for (int j = 0; j < max_length; ++j) {
for (int k = 0; k < half_size; ++k) {
const double val = (half_size > 1)
? j / pow(10000.0, double(k) / (half_size - 1))
: j / 10000.0;
dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
dst_ptr[half_size + k] =
src_ptr[half_size + k] * alpha + cos(val) * beta;
}
src_ptr += enc_size;
dst_ptr += enc_size;
}
}
}
};
template <typename DeviceContext, typename T>
class AddPositionEncodingGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* dOut =
context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
auto dout = framework::EigenVector<T>::Flatten(*dOut);
auto* dX =
context.Output<framework::LoDTensor>(framework::GradVarName("X"));
dX->mutable_data<T>(context.GetPlace());
auto dx = framework::EigenVector<T>::Flatten(*dX);
float alpha = context.Attr<float>("alpha");
auto* place =
context.template device_context<DeviceContext>().eigen_device();
dx.device(*place) = dout * static_cast<T>(alpha);
}
};
} // namespace operators
} // namespace paddle

@ -102,7 +102,9 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(gather_grad, ops::GatherGradOp);
REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
ops::GatherOpKernel<int>, ops::GatherOpKernel<double>);
ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
ops::GatherOpKernel<int64_t>);
REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
ops::GatherGradientOpKernel<double>,
ops::GatherGradientOpKernel<int>,
ops::GatherGradientOpKernel<double>);
ops::GatherGradientOpKernel<int64_t>);

@ -61,5 +61,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>);
REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>);
REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
ops::GatherOpCUDAKernel<double>,
ops::GatherOpCUDAKernel<int64_t>,
ops::GatherOpCUDAKernel<int>);
REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
ops::GatherGradOpCUDAKernel<double>,
ops::GatherGradOpCUDAKernel<int64_t>,
ops::GatherGradOpCUDAKernel<int>);

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save