Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into lstm_fix

dangqingqing 7 years ago
commit ed2d30b5f9

@@ -140,19 +140,9 @@ Similarly, the lengths in the top level LoD
 are transformed into offsets of elements/words as follows:
 
 ```
-0       9     10     15
-        =     =      =
-        3+2+4 1+9    2+3+10
+0       3     4      6
+        =     =      =
+        3     3+1    4+2
 ```
-
-so we can tell that the first article is from word 0 to word 9, and the second article is from word 9 to word 10.
-
-The complete offset representation is as follows:
-
-```
-0       9  10         15
-0  3  5 9  10  12     15
-   ||| || |||| |  || |||
-```
 
 ## Slicing of LoD Tensors
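The length-to-offset conversion illustrated above is a plain prefix sum: each offset is the running total of the lengths before it. A minimal self-contained sketch of the idea (the function name is ours, not Paddle's):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// offsets[0] = 0 and offsets[i+1] = offsets[i] + lengths[i], so sequence i
// spans [offsets[i], offsets[i+1]) in the underlying tensor.
std::vector<size_t> LengthsToOffsets(const std::vector<size_t>& lengths) {
  std::vector<size_t> offsets(lengths.size() + 1, 0);
  for (size_t i = 0; i < lengths.size(); ++i) {
    offsets[i + 1] = offsets[i] + lengths[i];
  }
  return offsets;
}

int main() {
  // Top-level lengths from the example: articles with 3, 1, and 2 sentences.
  for (size_t off : LengthsToOffsets({3, 1, 2})) std::cout << off << ' ';
  std::cout << '\n';  // prints: 0 3 4 6
}
```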

@@ -67,8 +67,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
                    out);
     in_var->SetLoDLevel(out_var->GetLodLevel());
   }
 
+  bool IsRuntime() const override;
+
+ protected:
+  VarDesc::VarType GetVarType(const std::string &name) const override;
+
  private:
   DDim GetDim(const std::string &name) const override;
   void SetDim(const std::string &name, const DDim &dim) override;
@@ -349,6 +352,9 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
     info.infer_var_type_(*this, block);
   } else {
     // all output type is LoDTensor by default
+    VLOG(10) << this->Type()
+             << " has not registered InferVarType. Set output variables to "
+                "LOD_TENSOR";
     for (auto &out_pair : this->outputs_) {
       for (auto &out_var_name : out_pair.second) {
         block->Var(out_var_name)->SetType(VarDesc::LOD_TENSOR);
@@ -448,6 +454,12 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name,
                                           const DDim &dim) {
   block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
 }
 
+bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
+
+VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
+    const std::string &name) const {
+  return block_.FindVarRecursive(name)->GetType();
+}
+
 }  // namespace framework
 }  // namespace paddle

@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/framework/operator.h"
 #include <algorithm>
 #include <atomic>
+#include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/shape_inference.h"
+#include "paddle/framework/var_type.h"
 
 namespace paddle {
 namespace framework {
@@ -365,7 +367,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
     out_tensor->set_lod(in_tensor.lod());
   }
 
- private:
+  bool IsRuntime() const override { return true; }
+
+ protected:
   DDim GetDim(const std::string& name) const override {
     Variable* var = scope_.FindVar(name);
     if (var->IsType<LoDTensor>()) {

@@ -388,6 +392,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
     }
   }
 
+  VarDesc::VarType GetVarType(const std::string& name) const override {
+    auto* var = scope_.FindVar(name);
+    return ToVarType(var->Type());
+  }
+
+ private:
   const OperatorBase& op_;
   const Scope& scope_;
 };

@@ -298,11 +298,10 @@ class ExecutionContext {
   }
 
 #ifdef PADDLE_WITH_CUDA
-  const platform::CUDADeviceContext& cuda_device_context() const {
+  const inline platform::CUDADeviceContext& cuda_device_context() const {
     PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
-    auto cuda_ctx =
-        reinterpret_cast<const platform::CUDADeviceContext*>(&device_context_);
-    return *cuda_ctx;
+    return *reinterpret_cast<const platform::CUDADeviceContext*>(
+        &device_context_);
   }
 #endif
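This helper centralizes a downcast that several GPU kernels below previously spelled out inline; after the change most call sites shrink to `ctx.cuda_device_context().stream()`. A self-contained analogue of the pattern, with the Paddle types replaced by toy stand-ins (the real method also checks `is_gpu_place` first, as shown above):

```cpp
#include <iostream>

// Simplified stand-ins for the Paddle types involved.
struct DeviceContext {
  virtual ~DeviceContext() = default;
};
struct CUDADeviceContext : DeviceContext {
  int stream() const { return 42; }  // stand-in for a cudaStream_t
};

struct ExecutionContext {
  explicit ExecutionContext(const DeviceContext& dc) : device_context_(dc) {}
  // One helper owns the downcast, so every call site becomes a one-liner.
  const CUDADeviceContext& cuda_device_context() const {
    return *reinterpret_cast<const CUDADeviceContext*>(&device_context_);
  }
  const DeviceContext& device_context_;
};

int main() {
  CUDADeviceContext cuda;
  ExecutionContext ctx(cuda);
  std::cout << ctx.cuda_device_context().stream() << '\n';  // prints: 42
}
```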

@@ -46,6 +46,23 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
     SetDim(names[i], dims[i]);
   }
 }
 
+std::vector<VarDesc::VarType> InferShapeContext::GetInputsVarType(
+    const std::string &name) const {
+  return GetVarTypes(Inputs(name));
+}
+
+std::vector<VarDesc::VarType> InferShapeContext::GetOutputsVarType(
+    const std::string &name) const {
+  return GetVarTypes(Outputs(name));
+}
+
+std::vector<VarDesc::VarType> InferShapeContext::GetVarTypes(
+    const std::vector<std::string> &names) const {
+  std::vector<VarDesc::VarType> retv;
+  retv.resize(names.size());
+  std::transform(names.begin(), names.end(), retv.begin(),
+                 std::bind(std::mem_fn(&InferShapeContext::GetVarType), this,
+                           std::placeholders::_1));
+  return retv;
+}
+
 }  // namespace framework
 }  // namespace paddle
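`GetVarTypes` above maps a member function over a list of names with the `std::transform` plus `std::bind(std::mem_fn(...))` idiom. The same idiom in isolation, with an illustrative `Lookup` type that is not part of Paddle:

```cpp
#include <algorithm>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct Lookup {
  int Get(const std::string& key) const { return static_cast<int>(key.size()); }
};

int main() {
  Lookup table;
  std::vector<std::string> keys = {"a", "bb", "ccc"};
  std::vector<int> values(keys.size());
  // Bind the member function to one object, leaving the key argument open.
  std::transform(keys.begin(), keys.end(), values.begin(),
                 std::bind(std::mem_fn(&Lookup::Get), &table,
                           std::placeholders::_1));
  for (int v : values) std::cout << v << ' ';  // prints: 1 2 3
}
```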

@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/ddim.h"
+#include "paddle/framework/framework.pb.h"
 
 namespace paddle {
 namespace framework {
@@ -26,6 +27,10 @@ class InferShapeContext {
   virtual bool HasInput(const std::string &name) const = 0;
   virtual bool HasOutput(const std::string &name) const = 0;
 
+  std::vector<VarDesc::VarType> GetInputsVarType(const std::string &name) const;
+  std::vector<VarDesc::VarType> GetOutputsVarType(
+      const std::string &name) const;
+
   virtual bool HasInputs(const std::string &name) const = 0;
   virtual bool HasOutputs(const std::string &name) const = 0;
@@ -46,6 +51,8 @@ class InferShapeContext {
   virtual void ShareLoD(const std::string &in, const std::string &out,
                         size_t i = 0, size_t j = 0) const = 0;
 
+  virtual bool IsRuntime() const = 0;
+
  protected:
   virtual framework::DDim GetDim(const std::string &name) const = 0;
   virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
@@ -55,6 +62,11 @@ class InferShapeContext {
   void SetDims(const std::vector<std::string> &names,
                const std::vector<framework::DDim> &dims);
 
+  std::vector<VarDesc::VarType> GetVarTypes(
+      const std::vector<std::string> &names) const;
+
+  virtual VarDesc::VarType GetVarType(const std::string &name) const = 0;
+
 };
 
 }  // namespace framework

@@ -0,0 +1,36 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/lod_tensor_array.h"
+
+namespace paddle {
+namespace framework {
+
+inline VarDesc::VarType ToVarType(std::type_index type) {
+  if (type.hash_code() == typeid(LoDTensor).hash_code()) {
+    return VarDesc_VarType_LOD_TENSOR;
+  } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) {
+    return VarDesc_VarType_LOD_RANK_TABLE;
+  } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
+    return VarDesc_VarType_LOD_TENSOR_ARRAY;
+  } else {
+    PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
+  }
+}
+}  // namespace framework
+}  // namespace paddle
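`ToVarType` dispatches on the `std::type_index` that `Variable::Type()` (added below) exposes. A standalone sketch of the same mechanism, using toy payload types and `type_index` equality in place of the `hash_code` comparison:

```cpp
#include <iostream>
#include <typeindex>
#include <typeinfo>

enum class VarType { LOD_TENSOR, LOD_RANK_TABLE, UNKNOWN };

struct LoDTensor {};     // toy stand-in
struct LoDRankTable {};  // toy stand-in

VarType ToVarTypeDemo(std::type_index type) {
  if (type == std::type_index(typeid(LoDTensor))) return VarType::LOD_TENSOR;
  if (type == std::type_index(typeid(LoDRankTable)))
    return VarType::LOD_RANK_TABLE;
  return VarType::UNKNOWN;  // the real helper throws via PADDLE_THROW here
}

int main() {
  std::cout << (ToVarTypeDemo(typeid(LoDTensor)) == VarType::LOD_TENSOR)
            << '\n';  // prints: 1
}
```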

@@ -48,6 +48,11 @@ class Variable {
   void Clear() { holder_.reset(); }
 
+  std::type_index Type() const {
+    PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory");
+    return holder_->Type();
+  }
+
  private:
   struct Placeholder {
     virtual ~Placeholder() {}

@@ -17,7 +17,7 @@ limitations under the License. */
 
 namespace paddle {
 
-ThreadLocalD<std::vector<MemoryHandle *>> ConvBaseProjection::convMem_;
+ThreadLocalD<std::vector<MemoryHandlePtr>> ConvBaseProjection::convMem_;
 
 ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config,
                                        ParameterPtr parameter,

@@ -175,18 +175,18 @@ void ConvBaseProjection::reshape(int batchSize) {
 }
 
 void *ConvBaseProjection::getSpaceBytes(size_t size) {
-  std::vector<MemoryHandle *> &convMem = *convMem_;
+  std::vector<MemoryHandlePtr> &convMem = *convMem_;
   if (convMem.empty()) {
     int numDevices = hl_get_device_count();
     convMem.resize(numDevices);
   }
 
   int devId = hl_get_device();
-  MemoryHandle **localMem = &(convMem[devId]);
-  if (NULL == *localMem || size > (*localMem)->getAllocSize()) {
-    *localMem = new GpuMemoryHandle(size);
+  MemoryHandlePtr localMem = convMem[devId];
+  if (NULL == localMem || size > localMem->getAllocSize()) {
+    localMem = std::make_shared<GpuMemoryHandle>(size);
   }
-  return (*localMem)->getBuf();
+  return localMem->getBuf();
 }
 
 ConvBaseProjection::~ConvBaseProjection() {

@@ -105,7 +105,7 @@ protected:
   bool bias_;
   std::unique_ptr<Weight> weight_;
-  static ThreadLocalD<std::vector<MemoryHandle*>> convMem_;
+  static ThreadLocalD<std::vector<MemoryHandlePtr>> convMem_;
 };
 
 }  // namespace paddle

@@ -110,7 +110,7 @@ function(op_library TARGET)
     # It's enough to just adding one operator to pybind
     file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
   endif()
 
   # reduce_op contains several operators
   if ("${TARGET}" STREQUAL "reduce_op")
     set(pybind_flag 1)
@@ -118,6 +118,11 @@ function(op_library TARGET)
     file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
   endif()
 
+  if ("${TARGET}" STREQUAL "tensor_array_read_write_op")
+    set(pybind_flag 1)
+    file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n")
+  endif()
+
   # pybind USE_NO_KERNEL_OP
   # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
   file(READ ${TARGET}.cc TARGET_CONTENT)
@@ -161,6 +166,7 @@ set(DEPS_OPS
     sequence_pool_op
     lod_rank_table_op
     lstm_op
+    tensor_array_read_write_op
     gru_op)
 
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
@@ -171,6 +177,7 @@ op_library(sum_op DEPS net_op selected_rows_functor)
 op_library(pool_op DEPS pooling)
 op_library(pool_with_index_op DEPS pooling)
 op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
+op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
 if(WITH_GPU)
   op_library(nccl_op DEPS nccl_common)
 endif()

@@ -72,11 +72,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
     }
 
     AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
-        1, PADDLE_CUDA_NUM_THREADS, 0,
-        reinterpret_cast<const platform::CUDADeviceContext&>(
-            ctx.device_context())
-            .stream()>>>(num_samples, infer_width, indices_data, label_data,
-                         accuracy_data);
+        1, PADDLE_CUDA_NUM_THREADS, 0, ctx.cuda_device_context().stream()>>>(
+        num_samples, infer_width, indices_data, label_data, accuracy_data);
   }
 };

@@ -27,7 +27,6 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
-using CUDADeviceContext = platform::CUDADeviceContext;
 
 static constexpr size_t kConvCudnnWorkspaceLimitBytes = 1024 * 1024 * 1024;

@@ -27,7 +27,6 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
-using CUDADeviceContext = platform::CUDADeviceContext;
 
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 1024 * 1024 * 1024;

@@ -130,9 +130,7 @@ class ConvShiftKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
     dim3 grid_dim(num_x_blocks, batch_size);
 
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext &>(
-                      context.device_context())
-                      .stream();
+    auto stream = context.cuda_device_context().stream();
 
     conv_shift_forward<T><<<grid_dim, x_per_block, mem_per_block, stream>>>(
         x_data, y_data, out_data, x_width, y_width, y_half_width, batch_size);

@@ -159,9 +157,7 @@ class ConvShiftGradKernel<platform::GPUPlace, T>
     int y_width = Y->dims()[1];
     int y_half_width = (y_width - 1) / 2;
 
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext &>(
-                      context.device_context())
-                      .stream();
+    auto stream = context.cuda_device_context().stream();
 
     const int x_per_block = 256;
     int num_x_blocks = div_up(x_width, x_per_block);

@@ -82,24 +82,19 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
 
     int block = 512;
     int grid = (batch_size * class_num + block - 1) / block;
+    auto stream = ctx.cuda_device_context().stream();
+
     if (ctx.Attr<bool>("soft_label")) {
       auto* label_data = label->data<T>();
-      SoftCrossEntropyGradientKernel<T><<<
-          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                              ctx.device_context())
-                              .stream()>>>(dx_data, dy_data, x_data, label_data,
-                                           batch_size, class_num);
+      SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
+          dx_data, dy_data, x_data, label_data, batch_size, class_num);
     } else {
       math::SetConstant<platform::GPUPlace, T> functor;
       functor(ctx.device_context(), dx, 0);
       auto* label_data = label->data<int64_t>();
       grid = (batch_size + block - 1) / block;
-      CrossEntropyGradientKernel<T><<<
-          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                              ctx.device_context())
-                              .stream()>>>(dx_data, dy_data, x_data, label_data,
-                                           batch_size, class_num);
+      CrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
+          dx_data, dy_data, x_data, label_data, batch_size, class_num);
     }
   }
 };

@@ -34,15 +34,18 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
     std::vector<int64_t> shape_int64(shape.size(), 0);
     std::transform(shape.begin(), shape.end(), shape_int64.begin(),
                    [](int a) { return static_cast<int64_t>(a); });
-    auto dims = framework::make_ddim(shape_int64);
+    auto output_dim = framework::make_ddim(shape_int64);
 
-    int dim_idx = ctx->Attrs().Get<int>("dim_idx");
-    PADDLE_ENFORCE_GE(dim_idx, 0);
-    PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), dim_idx);
-    PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), dim_idx);
+    int input_dim_idx = ctx->Attrs().Get<int>("input_dim_idx");
+    PADDLE_ENFORCE_GE(input_dim_idx, 0);
+    PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx);
 
-    dims[dim_idx] = ctx->GetInputDim("Input")[dim_idx];
-    ctx->SetOutputDim("Out", dims);
+    int output_dim_idx = ctx->Attrs().Get<int>("output_dim_idx");
+    PADDLE_ENFORCE_GE(output_dim_idx, 0);
+    PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), output_dim_idx);
+
+    output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx];
+    ctx->SetOutputDim("Out", output_dim);
   }
 
  protected:

@@ -69,8 +72,11 @@ class FillConstantBatchSizeLikeOpMaker
               "(Tensor) Tensor of specified shape will be filled "
               "with the specified value");
     AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
-    AddAttr<int>("dim_idx",
-                 "(int, default 0) The index of batch size dimension")
+    AddAttr<int>("input_dim_idx",
+                 "(int, default 0) the index of input's batch size dimension")
+        .SetDefault(0);
+    AddAttr<int>("output_dim_idx",
+                 "(int, default 0) the index of output's batch size dimension")
         .SetDefault(0);
     AddAttr<float>("value", "(float, default 0) The value to be filled")
         .SetDefault(0.0f);
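A worked example of the two new attributes, with made-up shapes: if `Input` has dims `[5, 10]`, the `shape` attribute is `[2, 3, 4]`, `input_dim_idx = 0`, and `output_dim_idx = 1`, then the batch size 5 is read from input dim 0 and written into output dim 1:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical values mirroring the InferShape logic above.
  std::vector<int64_t> output_dim = {2, 3, 4};  // the "shape" attribute
  std::vector<int64_t> input_dim = {5, 10};     // dims of Input
  int input_dim_idx = 0, output_dim_idx = 1;    // the two new attributes
  output_dim[output_dim_idx] = input_dim[input_dim_idx];
  for (int64_t d : output_dim) std::cout << d << ' ';  // prints: 2 5 4
}
```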
@@ -86,9 +92,10 @@ Fill up a variable with specified constant value.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(fill_constant_batch_size_like,
-                             ops::FillConstantBatchSizeLikeOp,
-                             ops::FillConstantBatchSizeLikeOpMaker);
+REGISTER_OPERATOR(fill_constant_batch_size_like,
+                  ops::FillConstantBatchSizeLikeOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::FillConstantBatchSizeLikeOpMaker);
 REGISTER_OP_CPU_KERNEL(
     fill_constant_batch_size_like,
     ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, float>,

@@ -35,7 +35,9 @@ class FillConstantOp : public framework::OperatorWithKernel {
  protected:
   framework::DataType IndicateDataType(
       const framework::ExecutionContext &ctx) const override {
-    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
+    int data_type = ctx.Attr<int>("data_type");
+    VLOG(10) << " FillConstant data_type = " << data_type;
+    return static_cast<framework::DataType>(data_type);
   }
 };

@@ -71,4 +73,5 @@ REGISTER_OP_WITHOUT_GRADIENT(fill_constant, ops::FillConstantOp,
 REGISTER_OP_CPU_KERNEL(
     fill_constant, ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>,
     ops::FillConstantOpKernel<paddle::platform::CPUPlace, double>,
-    ops::FillConstantOpKernel<paddle::platform::CPUPlace, int>);
+    ops::FillConstantOpKernel<paddle::platform::CPUPlace, int>,
+    ops::FillConstantOpKernel<paddle::platform::CPUPlace, int64_t>);

@@ -20,4 +20,5 @@ namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
     fill_constant, ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>,
     ops::FillConstantOpKernel<paddle::platform::GPUPlace, double>,
-    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>);
+    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>,
+    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int64_t>);

@@ -31,7 +31,6 @@ class IncrementOp : public framework::OperatorWithKernel {
   }
 };
 
-template <typename AttrType>
 class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   IncrementOpMaker(framework::OpProto *proto,

@@ -39,10 +38,10 @@ class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor) The input tensor of increment operator");
     AddOutput("Out", "(Tensor) The output tensor of increment operator.");
-    AddAttr<AttrType>("step",
-                      "(float, default 1.0) "
-                      "The step size by which the "
-                      "input tensor will be incremented.")
+    AddAttr<float>("step",
+                   "(float, default 1.0) "
+                   "The step size by which the "
+                   "input tensor will be incremented.")
         .SetDefault(1.0);
     AddComment(R"DOC(
 Increment Operator.

@@ -73,7 +72,10 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker<float>,
+REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker,
                   ops::IncrementGradOpMaker);
-REGISTER_OP_CPU_KERNEL(increment,
-                       ops::IncrementKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    increment, ops::IncrementKernel<paddle::platform::CPUPlace, float>,
+    ops::IncrementKernel<paddle::platform::CPUPlace, double>,
+    ops::IncrementKernel<paddle::platform::CPUPlace, int>,
+    ops::IncrementKernel<paddle::platform::CPUPlace, int64_t>);

@@ -16,4 +16,7 @@
 REGISTER_OP_GPU_KERNEL(
     increment,
-    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, float>);
+    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, float>,
+    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, double>,
+    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, int>,
+    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, int64_t>);

@@ -19,7 +19,7 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T, typename AttrType = T>
+template <typename Place, typename T>
 class IncrementKernel : public framework::OpKernel<T> {
  public:
   virtual void Compute(const framework::ExecutionContext& context) const {

@@ -27,7 +27,7 @@ class IncrementKernel : public framework::OpKernel<T> {
     auto* in = context.Input<framework::Tensor>("X");
     tensor->mutable_data<T>(in->place());
 
-    auto step = static_cast<T>(context.Attr<AttrType>("step"));
+    auto step = static_cast<T>(context.Attr<float>("step"));
 
     auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
     auto eigen_in = framework::EigenVector<T>::Flatten(*in);

@@ -74,10 +74,9 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
 
     dim3 threads(128, 8);
     dim3 grids(8, 1);
-    LookupTable<T, 128, 8, 8><<<
-        grids, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                               context.device_context())
-                               .stream()>>>(output, table, ids, N, K, D);
+    LookupTable<T, 128, 8,
+                8><<<grids, threads, 0, context.device_context().stream()>>>(
+        output, table, ids, N, K, D);
   }
 };

@@ -95,9 +94,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
     auto* ids_data = ids->data<int64_t>();
     auto ids_dim = ids->dims();
 
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                      context.device_context())
-                      .stream();
+    auto stream = context.cuda_device_context().stream();
     // copy GPU memory to CPU pinned memory
     framework::Vector<int64_t> new_rows;
     new_rows.resize(ids_dim[0]);

@@ -136,11 +133,10 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
     dim3 threads(128, 8);
     dim3 grids(8, 1);
-    LookupTableGrad<T, 128, 8,
-                    8><<<grids, threads, 0,
-                         reinterpret_cast<const platform::CUDADeviceContext&>(
-                             context.device_context())
-                             .stream()>>>(d_table, d_output, ids, N, K, D);
+    LookupTableGrad<
+        T, 128, 8,
+        8><<<grids, threads, 0, context.device_context().stream()>>>(
+        d_table, d_output, ids, N, K, D);
   }
 }

@@ -1,3 +1 @@
-if(WITH_AVX)
-  cc_library(activation_functions SRCS avx_functions.cc)
-endif()
+cc_library(activation_functions SRCS avx_functions.cc)

@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#ifdef __AVX__
+
 #include <immintrin.h>
 #include "paddle/operators/math/detail/activation_functions.h"
 // TODO(qingqing) refine this dependence

@@ -84,3 +86,5 @@ __m256 Identity(const __m256 a, const __m256 b) { return a; }
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
+
+#endif

