Merge branch 'develop' into mkldnn_test

ce
Tao Luo 7 years ago
commit 75bb0babef

@ -75,7 +75,8 @@ paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'outp
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
@ -84,6 +85,7 @@ paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name']
paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))

@ -64,7 +64,8 @@ class OpHandleBase {
virtual bool IsMultiDeviceTransfer() { return false; }
const platform::DeviceContext *DeviceContext(platform::Place place) {
return dev_ctxes_[place];
auto it = dev_ctxes_.find(place);
return it != dev_ctxes_.end() ? it->second : nullptr;
}
void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {

@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
VLOG(5) << "destroy ExecutorPrepareContext";
}
template <typename RefCntMap>
static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
GarbageCollector<Tensor>* gc,
RefCntMap* ref_cnts) {
std::unordered_set<Tensor*> erase_tensors;
auto handler = [&](const VariableNameMap& name_map) {
for (auto& name_pair : name_map) {
for (auto& name : name_pair.second) {
auto it = ref_cnts->find(name);
if (it == ref_cnts->end()) continue;
if ((it->second)-- == 1) {
auto* var = scope.FindVar(name);
if (var != nullptr) {
VLOG(10) << "Erase tensor \'" << name << "\'";
if (var->IsType<LoDTensor>()) {
erase_tensors.insert(var->GetMutable<LoDTensor>());
} else if (var->IsType<SelectedRows>()) {
erase_tensors.insert(
var->GetMutable<SelectedRows>()->mutable_value());
}
}
}
}
}
};
handler(op->Inputs());
handler(op->Outputs());
if (!erase_tensors.empty()) {
gc->Add(erase_tensors);
}
}
Executor::Executor(const platform::Place& place) : place_(place) {}
void Executor::Close() {
@ -66,7 +101,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope>>();
var->GetMutable<std::vector<framework::Scope*>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
}
int64_t max_memory_size = GetEagerDeletionThreshold();
std::unique_ptr<GarbageCollector<Tensor>> gc;
if (max_memory_size >= 0) {
// WhileOp would set keep_kids to false
// WhileGradOp would need the scopes created in WhileOp
// Perhaps, we should not perform eager deletion in WhileOp
// The scopes and variables created by WhileOp would be deleted
// in WhileGradOp.
if (max_memory_size >= 0 && !keep_kids) {
ctx->ResetReferenceCount();
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
op->Run(*local_scope, place_);
if (gc != nullptr) {
std::vector<std::string> erase_vars;
for (auto& input : op->Inputs()) {
for (auto& input_name : input.second) {
auto it = ctx->cur_ref_cnts_.find(input_name);
if (it == ctx->cur_ref_cnts_.end()) continue;
if (it->second == 1) { // should delete it
erase_vars.emplace_back(input_name);
ctx->cur_ref_cnts_.erase(input_name);
} else {
--(it->second);
}
}
}
for (auto& output : op->Outputs()) {
for (auto& output_name : output.second) {
auto it = ctx->cur_ref_cnts_.find(output_name);
if (it == ctx->cur_ref_cnts_.end()) continue;
if (it->second == 1) {
erase_vars.emplace_back(output_name);
ctx->cur_ref_cnts_.erase(output_name);
} else {
--(it->second);
}
}
}
if (!erase_vars.empty()) {
std::vector<framework::LoDTensor*> erase_tensors;
for (auto& name : erase_vars) {
auto* var = local_scope->FindVar(name);
if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) {
auto* tensor = var->GetMutable<framework::LoDTensor>();
erase_tensors.push_back(tensor);
}
}
if (!erase_tensors.empty()) gc->Add(erase_tensors);
}
DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
&(ctx->cur_ref_cnts_));
}
if (FLAGS_benchmark) {

@ -32,38 +32,32 @@ template <typename T>
std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
const ProgramDesc& prog, size_t block_id) {
auto& block = prog.Block(block_id);
std::unordered_set<std::string> ignored_vars;
std::unordered_map<std::string, T> ref_cnts;
for (auto var_desc : block.AllVars()) {
auto type = var_desc->Proto()->type().type();
if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) {
ignored_vars.insert(var_desc->Name()); // ignore persistable vars
}
}
for (auto op_desc : block.AllOps()) {
for (auto& input : op_desc->Inputs()) {
for (auto& input_name : input.second) {
if (!ignored_vars.count(input_name)) {
if (ref_cnts.count(input_name))
++ref_cnts[input_name];
else
ref_cnts[input_name] = 1;
auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
for (auto& name_pair : name_map) {
for (auto& name : name_pair.second) {
auto* var_desc = block.FindVar(name);
if (var_desc == nullptr || var_desc->Persistable()) continue;
auto type = var_desc->Proto()->type().type();
if (type != proto::VarType::LOD_TENSOR &&
type != proto::VarType::SELECTED_ROWS) {
continue;
}
}
}
for (auto& output : op_desc->Outputs()) {
for (auto output_name : output.second) {
if (!ignored_vars.count(output_name)) {
if (ref_cnts.count(output_name))
++ref_cnts[output_name];
else
ref_cnts[output_name] = 1;
auto it = ref_cnts.find(name);
if (it != ref_cnts.end()) {
++it->second;
} else {
ref_cnts[name] = 1;
}
}
}
};
for (auto op_desc : block.AllOps()) {
update_ref_cnts(op_desc, op_desc->Inputs());
update_ref_cnts(op_desc, op_desc->Outputs());
}
return ref_cnts;
}

@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
// be created.
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
Variable* g_feed_value = scope->Var(var_name);
auto& feed_inputs =
*(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
if (index >= feed_inputs.size()) {
feed_inputs.resize(index + 1);
}

@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope>>();
var->GetMutable<std::vector<framework::Scope *>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {

@ -149,9 +149,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform::SetDeviceId(dev_id);
#endif
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
// The profile has a process-wide mutex, results in serious performance issue
// in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern.
if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
} else {
RunImpl(scope, place);
}
VLOG(3) << place << " " << DebugStringEx(&scope);
}

@ -307,6 +307,10 @@ ParallelExecutor::~ParallelExecutor() {
}
}
}
// member_ must be destructed before gcs_ since the destructor of
// ReferenceCountOpHandle use raw pointers of gcs_ inside.
member_.reset();
}
} // namespace framework

@ -75,7 +75,7 @@ class ParallelExecutor {
private:
void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
ParallelExecutorPrivate *member_;
std::unique_ptr<ParallelExecutorPrivate> member_;
#ifdef PADDLE_WITH_CUDA
// ref_cnts_ is only initialized when ParallelExecutor constructs, and then

@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() {
Scope::~Scope() { DropKids(); }
Scope& Scope::NewScope() const {
std::unique_lock<std::mutex> lock(mutex_);
std::lock_guard<std::mutex> lock(mutex_);
kids_.push_back(new Scope(this));
return *kids_.back();
}
Variable* Scope::Var(const std::string& name) {
std::unique_lock<std::mutex> lock(mutex_);
std::lock_guard<std::mutex> lock(mutex_);
return VarInternal(name);
}
Variable* Scope::Var(std::string* name) {
std::unique_lock<std::mutex> lock(mutex_);
std::lock_guard<std::mutex> lock(mutex_);
auto new_name = string::Sprintf("%p.%d", this, vars_.size());
if (name != nullptr) {
*name = new_name;
@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) {
}
Variable* Scope::FindVar(const std::string& name) const {
std::unique_lock<std::mutex> lock(mutex_);
std::lock_guard<std::mutex> lock(mutex_);
return FindVarInternal(name);
}
Variable* Scope::FindLocalVar(const std::string& name) const {
std::lock_guard<std::mutex> lock(mutex_);
return FindVarLocally(name);
}
const Scope* Scope::FindScope(const Variable* var) const {
std::unique_lock<std::mutex> lock(mutex_);
std::lock_guard<std::mutex> lock(mutex_);
return FindScopeInternal(var);
}
void Scope::DropKids() {
std::unique_lock<std::mutex> lock(mutex_);
std::lock_guard<std::mutex> lock(mutex_);
for (Scope* s : kids_) delete s;
kids_.clear();
}
bool Scope::HasKid(const Scope* scope) const {
std::unique_lock<std::mutex> lock(mutex_);
std::lock_guard<std::mutex> lock(mutex_);
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
return it != this->kids_.end();
}
std::vector<std::string> Scope::LocalVarNames() const {
std::unique_lock<std::mutex> lock(mutex_);
std::lock_guard<std::mutex> lock(mutex_);
std::vector<std::string> known_vars;
known_vars.reserve(this->vars_.size());
for (auto& p : vars_) {
@ -101,7 +106,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
}
void Scope::DeleteScope(Scope* scope) const {
std::unique_lock<std::mutex> lock(mutex_);
std::lock_guard<std::mutex> lock(mutex_);
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it);
@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const {
}
void Scope::EraseVars(const std::vector<std::string>& var_names) {
std::unique_lock<std::mutex> lock(mutex_);
std::lock_guard<std::mutex> lock(mutex_);
std::set<std::string> var_set(var_names.begin(), var_names.end());
for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) {
@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void Scope::Rename(const std::string& origin_name,
const std::string& new_name) const {
std::unique_lock<std::mutex> lock(mutex_);
std::lock_guard<std::mutex> lock(mutex_);
RenameInternal(origin_name, new_name);
}
std::string Scope::Rename(const std::string& origin_name) const {
std::unique_lock<std::mutex> lock(mutex_);
std::lock_guard<std::mutex> lock(mutex_);
auto new_name = string::Sprintf("%p.%d", this, vars_.size());
RenameInternal(origin_name, new_name);
return new_name;

@ -63,6 +63,11 @@ class Scope {
/// Caller doesn't own the returned Variable.
Variable* FindVar(const std::string& name) const;
/// Find a variable in the current scope.
/// Return nullptr if cannot find.
/// Caller doesn't own the returned Variable.
Variable* FindLocalVar(const std::string& name) const;
const Scope* parent() const { return parent_; }
/// Find the scope or an ancestor scope that contains the given variable.

@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size);
}
@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
if (platform::is_same_place(src_place, dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else {
@ -114,6 +124,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
auto dst_ptr = dst->mutable_data(dst_place, src.type());
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size);
}
@ -130,6 +145,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
VLOG(3) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
return;
}
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);

@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]);
}
TensorCopy(dst_tensor, *cpu_place, &dst_tensor);
for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]);
}
EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
Tensor slice_tensor = src_tensor.Slice(1, 2);
@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]);
}
// Copy the same tensor
TensorCopy(gpu_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
gpu_ctx.Wait();
const int* dst_ptr_tmp = dst_tensor.data<int>();
EXPECT_NE(src_ptr, dst_ptr_tmp);
for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]);
}
Tensor slice_tensor = src_tensor.Slice(1, 2);
// CPU Slice Tensor to GPU Tensor

@ -59,6 +59,7 @@ class VarDesc {
public:
explicit VarDesc(const std::string &name) {
desc_.set_name(name);
// TODO(paddle-dev): Why default to lodtensor.
desc_.mutable_type()->set_type(proto::VarType::LOD_TENSOR);
}

@ -38,8 +38,12 @@ class Variable {
template <typename T>
T* GetMutable() {
if (!IsType<T>()) {
if (!holder_) {
holder_.reset(new PlaceholderImpl<T>(new T()));
} else {
PADDLE_ENFORCE(IsType<T>(),
"Variable must be type %s, the holding type is %s",
typeid(T).name(), holder_->Type().name());
}
return static_cast<T*>(holder_->Ptr());
}

@ -33,9 +33,10 @@ TEST(Variable, GetMutable) {
const Tensor& tt = v->Get<Tensor>();
EXPECT_EQ(1234, tt.content_);
std::string* s = v->GetMutable<std::string>();
*s = "hello";
const std::string& ss = v->Get<std::string>();
EXPECT_EQ("hello", ss);
try {
v->GetMutable<std::string>();
} catch (std::exception& e) {
return;
}
EXPECT_TRUE(false);
}

@ -25,9 +25,11 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(profile);
DECLARE_int32(paddle_num_threads);
namespace paddle {
@ -47,6 +49,9 @@ bool AnalysisPredictor::Init(
}
#endif
// no matter with or without MKLDNN
paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim "
@ -335,6 +340,19 @@ bool AnalysisPredictor::LoadProgramDesc() {
}
return true;
}
AnalysisPredictor::~AnalysisPredictor() {
#if !defined(_WIN32)
if (FLAGS_profile) {
platform::DisableProfiler(platform::EventSortingKey::kTotal,
"./profile.log");
}
#endif
if (sub_scope_) {
scope_->DeleteScope(sub_scope_);
}
}
std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
auto *x = new AnalysisPredictor(config_);
x->Init(scope_, inference_program_);

@ -72,6 +72,7 @@ class AnalysisPredictor : public PaddlePredictor {
template <typename T>
void GetFetchOne(const framework::LoDTensor &fetchs,
PaddleTensor *output_data);
~AnalysisPredictor();
private:
contrib::AnalysisConfig config_;

@ -23,9 +23,11 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(profile, false, "Turn on profiler for fluid");
DECLARE_int32(paddle_num_threads);
namespace paddle {
namespace {
@ -72,6 +74,9 @@ bool NativePaddlePredictor::Init(
}
#endif
// no matter with or without MKLDNN
paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {

@ -300,7 +300,7 @@ op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op)
op_library(fake_quantize_op DEPS memory)
op_library(fusion_lstm_op DEPS cpu_lstm_compute)
op_library(fusion_lstm_op DEPS jit_kernel)
if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col)
op_library(layer_norm_op DEPS cub)

@ -18,6 +18,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/for_range.h"
@ -199,23 +200,9 @@ struct SparseAdamFunctor {
row_numel_(row_numel),
row_count_(row_count) {}
inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const {
int64_t beg = 0, end = row_count_ - 1;
while (beg <= end) {
auto mid = ((beg + end) >> 1);
if (rows_[mid] == row)
return mid;
else if (rows_[mid] < row)
beg = mid + 1;
else
end = mid - 1;
}
return -1;
}
inline HOSTDEVICE void operator()(size_t i) const {
int64_t row = i / row_numel_;
auto row_idx = BinarySearchInRows(row);
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
// The following code is the same as dense

@ -70,6 +70,12 @@ class FillConstantOp : public framework::OperatorBase {
}
};
class FillConstantOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {}
};
class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
@ -102,4 +108,5 @@ Fill up a variable with specified constant value.
namespace ops = paddle::operators;
REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
ops::FillConstantInferShape, ops::FillConstantOpMaker,
paddle::framework::EmptyGradOpMaker);
paddle::framework::EmptyGradOpMaker,
ops::FillConstantOpVarTypeInference);

File diff suppressed because it is too large Load Diff

@ -60,7 +60,7 @@ class OverflowOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor) 1-dim tensor, contains a bool scalar. The output "
"tensor of overflow operator.");
AddComment(string::Sprintf(R"DOC(
Overflow operator.
Overflow %s operator.
$$Out = any(X)$$
@ -69,6 +69,8 @@ Out = Inf if any X contains Inf,
Out = Nan if any X contains Nan,
Out = 0 if no Inf/Nan detected.
If X contains both Inf/Nan, it will return the first indicator it meeted.
%s
)DOC",
GetName(), GetComments()));
}

@ -45,8 +45,6 @@ math_library(im2col)
if (NOT WIN32) # windows do not support avx functions yet.
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
# TODO(TJ): ugly workaround, clean me
cc_library(cpu_lstm_compute SRCS cpu_lstm_compute.cc DEPS activation_functions cblas cpu_info)
endif (NOT WIN32)
cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
@ -76,3 +74,7 @@ if(WITH_GPU)
endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat)
cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
cc_library(jit_kernel
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_lstm.cc
DEPS cpu_info cblas activation_functions)
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save