Update and Resolve conflicts

mobile_baidu
dangqingqing 8 years ago
commit 483947c45d

@@ -45,8 +45,9 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
+cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog)
+cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

@@ -21,7 +21,9 @@ limitations under the License. */
#include <vector>
#include "paddle/framework/feed_fetch_type.h"
+#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/scope.h"
@@ -70,10 +72,14 @@ static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
    var->GetMutable<FeedFetchList>();
  } else if (var_type == VarDesc::STEP_SCOPES) {
    var->GetMutable<std::vector<framework::Scope>>();
+  } else if (var_type == VarDesc::LOD_RANK_TABLE) {
+    var->GetMutable<LoDRankTable>();
+  } else if (var_type == VarDesc::LOD_TENSOR_ARRAY) {
+    var->GetMutable<LoDTensorArray>();
  } else {
    PADDLE_THROW(
        "Variable type %d is not in "
-        "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST]",
+        "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE]",
        var_type);
  }
}

@@ -109,6 +109,11 @@ message LoDTensorDesc {
  optional int32 lod_level = 2 [ default = 0 ];
}

+message LoDTensorArrayDesc {
+  required TensorDesc tensor = 1;
+  optional int32 lod_level = 2 [ default = 0 ];
+}
+
message VarDesc {
  enum VarType {
    LOD_TENSOR = 1;
@@ -116,11 +121,14 @@ message VarDesc {
    FEED_MINIBATCH = 3;
    FETCH_LIST = 4;
    STEP_SCOPES = 5;
+    LOD_RANK_TABLE = 6;
+    LOD_TENSOR_ARRAY = 7;
  }
  required string name = 1;
  required VarType type = 2;
  optional LoDTensorDesc lod_tensor = 3;
  optional TensorDesc selected_rows = 4;
+  optional LoDTensorArrayDesc tensor_array = 6;
  optional bool persistable = 5 [ default = false ];
}
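For reference, a minimal sketch of how the new fields could be filled through the protobuf-generated C++ API (not part of this commit; the FP32 data type, the lod_level value, and the helper name MakeTensorArrayVar are illustrative only):

#include <string>
#include "paddle/framework/framework.pb.h"

// Hypothetical helper: builds a VarDesc describing a LoDTensorArray variable.
// mutable_tensor_array() and set_lod_level() come from the LoDTensorArrayDesc
// message added above; DataType is the enum declared earlier in framework.proto.
paddle::framework::VarDesc MakeTensorArrayVar(const std::string& name) {
  paddle::framework::VarDesc var;
  var.set_name(name);
  var.set_type(paddle::framework::VarDesc::LOD_TENSOR_ARRAY);
  auto* array = var.mutable_tensor_array();
  array->set_lod_level(1);  // illustrative value
  array->mutable_tensor()->set_data_type(paddle::framework::DataType::FP32);
  array->mutable_tensor()->add_dims(-1);  // e.g. an unknown batch dimension
  return var;
}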

@@ -0,0 +1,48 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/lod_rank_table.h"
namespace paddle {
namespace framework {
void LoDRankTable::Reset(const LoD& lod, size_t level) {
this->coarse_lod_.clear();
this->items_.clear();
PADDLE_ENFORCE(level < lod.size(),
"Cannot rank lod since the level %d is less than lod size %d",
level, lod.size());
coarse_lod_.reserve(level);
for (size_t i = 0; i < level; ++i) {
coarse_lod_.push_back(lod[i]);
}
auto& vec = lod[level];
for (size_t i = 0; i < vec.size() - 1; ++i) {
TableItem item;
item.index = i;
item.length = vec[i + 1] - vec[i];
items_.emplace_back(item);
}
// NOTE(yuyang18):
//
// The time complexity of stable_sort is O(N*log(N)) if additional memory is
// available. It is easy to debug and unit test when using `stable_sort`
// instead of `sort`. Also, the items of a rank table will not be too large.
std::stable_sort(items_.begin(), items_.end(),
[](const TableItem& a, const TableItem& b) {
return a.length > b.length;
});
}
} // namespace framework
} // namespace paddle

@@ -0,0 +1,55 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/lod_tensor.h"
namespace paddle {
namespace framework {
// LoDRankTable stores one `level` of a `lod`, ordered by sequence length in
// descending order. It is useful when implementing dynamic RNN and is shared
// by the dynamic RNN memory, dynamic RNN slice input and dynamic RNN slice
// output operators.
//
// Each table item contains two elements: the length of a sequence and the
// index of that sequence in the given level.
//
// LoDRankTable also stores the coarse_lod, i.e. the LoD information of the
// levels above the input level, so that the output LoD information can be
// restored later.
class LoDRankTable {
public:
struct TableItem {
size_t index;
size_t length;
};
LoDRankTable() {}
void Reset(const LoD& lod, size_t level);
const std::vector<TableItem>& items() const { return this->items_; }
const LoD& coarse_lod() const { return this->coarse_lod_; }
size_t level() const { return coarse_lod_.size(); }
private:
LoD coarse_lod_;
std::vector<TableItem> items_;
};
} // namespace framework
} // namespace paddle
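A minimal usage sketch of the class (not part of this commit; the LoD values are made up) showing what Reset produces:

#include <vector>
#include "paddle/framework/lod_rank_table.h"

// Rank the sequences of LoD level 1 by length, in descending order.
void LoDRankTableExample() {
  paddle::framework::LoD lod;
  lod.push_back(std::vector<size_t>{0, 2, 3});     // level 0: two coarse groups
  lod.push_back(std::vector<size_t>{0, 3, 4, 8});  // level 1: lengths 3, 1, 4

  paddle::framework::LoDRankTable table;
  table.Reset(lod, 1);
  // items() now holds (index, length) pairs sorted by descending length:
  //   (2, 4), (0, 3), (1, 1)
  // coarse_lod() keeps level 0 only, so the caller can later rebuild the
  // output LoD above the ranked level.
}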

@@ -135,5 +135,43 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
  PADDLE_ENFORCE_LT(begin, end, "Cannot shrink, the result tensor is empty.");
  ShareDataWith(Slice(begin, end));
}
void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
std::vector<std::vector<size_t>>* lod_length,
size_t* start_offset) {
lod_length->clear();
PADDLE_ENFORCE(start_idx < lod.size() - 1,
"start_idx should be >= 0 and < lod.size() - 1.");
PADDLE_ENFORCE(end_idx < lod.size(),
"end_idx should be >= 0 and < lod.size().");
PADDLE_ENFORCE_LE(start_idx, end_idx,
"start_idx should be less than end_idx.");
for (size_t level_idx = 0; level_idx < lod.size(); ++level_idx) {
std::vector<size_t> level_lens;
for (size_t i = start_idx; i < end_idx; ++i) {
level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
}
lod_length->emplace_back(level_lens);
start_idx = lod[level_idx][start_idx];
end_idx = lod[level_idx][end_idx];
}
*start_offset = start_idx;
}
void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length) {
PADDLE_ENFORCE_EQ(
lod->size(), lod_length.size(),
"The lod_length should has the same size with the appended lod.");
for (size_t i = 0; i < lod->size(); ++i) {
auto& level = (*lod)[i];
if (level.empty()) {
level.push_back(0);
}
for (size_t len : lod_length[i]) {
level.push_back(level.back() + len);
}
}
}
}  // namespace framework
}  // namespace paddle
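To make the level-by-level descent of GetFineGrainedLoDLength concrete, here is a worked trace (using the same three-level LoD and the same start_idx/end_idx as the unit test added further below):

// lod[0] = {0, 2, 4, 5}
// lod[1] = {0, 1, 6, 8, 10, 11}
// lod[2] = {0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29}
// GetFineGrainedLoDLength(lod, /*start_idx=*/1, /*end_idx=*/2, ...):
//
// Level 0: lengths for i in [1, 2)  -> {4 - 2}                              = {2}
//          descend: start_idx = lod[0][1] = 2, end_idx = lod[0][2] = 4
// Level 1: lengths for i in [2, 4)  -> {8 - 6, 10 - 8}                      = {2, 2}
//          descend: start_idx = lod[1][2] = 6, end_idx = lod[1][4] = 10
// Level 2: lengths for i in [6, 10) -> {17 - 15, 20 - 17, 24 - 20, 26 - 24} = {2, 3, 4, 2}
//
// Result: *lod_length = {{2}, {2, 2}, {2, 3, 4, 2}} and
//         *start_offset = lod[2][6] = 15,
// which AppendLoD can later splice onto a partially built LoD.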

@@ -181,5 +181,11 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
  return tensor;
}

+void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
+                             std::vector<std::vector<size_t>>* lod_length,
+                             size_t* start_offset);
+
+void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length);
+
}  // namespace framework
}  // namespace paddle

@@ -0,0 +1,23 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/framework/lod_tensor.h"
namespace paddle {
namespace framework {
using LoDTensorArray = std::vector<LoDTensor>;
}  // namespace framework
} // namespace paddle

@@ -144,5 +144,47 @@ TEST(LodExpand, test) {
  }
}
TEST(LoD, GetFineGrainedLoDLength) {
LoD lod;
lod.push_back(std::vector<size_t>{0, 2, 4, 5});
lod.push_back(std::vector<size_t>{0, 1, 6, 8, 10, 11});
lod.push_back(
std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29});
std::vector<std::vector<size_t>> lod_length;
size_t start_offset;
paddle::framework::GetFineGrainedLoDLength(lod, 1, 2, &lod_length,
&start_offset);
std::vector<std::vector<size_t>> expected;
expected.push_back(std::vector<size_t>{2});
expected.push_back(std::vector<size_t>{2, 2});
expected.push_back(std::vector<size_t>{2, 3, 4, 2});
EXPECT_EQ(lod_length, expected);
EXPECT_EQ(start_offset, 15UL);
}
TEST(LoD, AppendLoD) {
std::vector<std::vector<size_t>> lod_lens;
lod_lens.push_back(std::vector<size_t>{2});
lod_lens.push_back(std::vector<size_t>{2, 2});
lod_lens.push_back(std::vector<size_t>{2, 3, 4, 2});
LoD origin;
origin.push_back(std::vector<size_t>{0, 2});
origin.push_back(std::vector<size_t>{0, 1, 6});
origin.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15});
paddle::framework::AppendLoD(&origin, lod_lens);
LoD expected;
expected.push_back(std::vector<size_t>{0, 2, 4});
expected.push_back(std::vector<size_t>{0, 1, 6, 8, 10});
expected.push_back(
std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26});
EXPECT_EQ(origin, expected);
}
}  // namespace framework
}  // namespace paddle

@@ -408,7 +408,6 @@ class OperatorWithKernel : public OperatorBase {
  // indicate kernel DataType by input data. Defaultly all input data must be
  // same.
  virtual DataType IndicateDataType(const ExecutionContext& ctx) const {
-    VLOG(3) << "Default IndicateDataType " << this->Type();
    auto& scope = ctx.scope();
    int data_type = -1;
    for (auto& input : this->inputs_) {
@@ -425,7 +424,6 @@ class OperatorWithKernel : public OperatorBase {
        }
        if (t != nullptr) {
          int tmp = static_cast<int>(ToDataType(t->type()));
-          VLOG(3) << "Input " << ipt_name << " with data_type " << tmp;
          PADDLE_ENFORCE(tmp == data_type || data_type == -1,
                         "DataType of Paddle Op %s must be the same.",
                         Type());

@@ -37,13 +37,27 @@ std::vector<int64_t> VarDescBind::Shape() const {
DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); }

void VarDescBind::SetLoDLevel(int32_t lod_level) {
-  PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR);
-  desc_.mutable_lod_tensor()->set_lod_level(lod_level);
+  switch (desc_.type()) {
+    case VarDesc::LOD_TENSOR:
+      desc_.mutable_lod_tensor()->set_lod_level(lod_level);
+      break;
+    case VarDesc::LOD_TENSOR_ARRAY:
+      desc_.mutable_tensor_array()->set_lod_level(lod_level);
+      break;
+    default:
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
+  }
}

int32_t VarDescBind::GetLodLevel() const {
-  PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR);
-  return desc_.lod_tensor().lod_level();
+  switch (desc_.type()) {
+    case VarDesc::LOD_TENSOR:
+      return desc_.lod_tensor().lod_level();
+    case VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.tensor_array().lod_level();
+    default:
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
+  }
}

const TensorDesc &VarDescBind::tensor_desc() const {
@@ -53,6 +67,8 @@ const TensorDesc &VarDescBind::tensor_desc() const {
      return desc_.selected_rows();
    case VarDesc::LOD_TENSOR:
      return desc_.lod_tensor().tensor();
+    case VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.tensor_array().tensor();
    default:
      PADDLE_THROW("Unexpected branch.");
  }
@@ -66,6 +82,8 @@ TensorDesc *VarDescBind::mutable_tensor_desc() {
      return desc_.mutable_selected_rows();
    case VarDesc::LOD_TENSOR:
      return desc_.mutable_lod_tensor()->mutable_tensor();
+    case VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.mutable_tensor_array()->mutable_tensor();
    default:
      PADDLE_THROW("Unexpected branch.");
  }

@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <vector>
+#include "glog/logging.h"
#include "paddle/framework/framework.pb.h"
namespace paddle {

@@ -0,0 +1,154 @@
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLDNNAddtoLayer.h"
using namespace mkldnn; // NOLINT
namespace paddle {
REGISTER_LAYER(mkldnn_addto, MKLDNNAddtoLayer);
bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
if (!MKLDNNLayer::init(layerMap, parameterMap)) {
return false;
}
layerSize_ = getSize();
for (size_t i = 0; i < inputLayers_.size(); i++) {
CHECK_EQ(layerSize_, inputLayers_[i]->getSize()) << "input sizes must be equal";
}
if (biasParameter_.get() != NULL) {
biases_ =
std::unique_ptr<Weight>(new Weight(1, layerSize_, biasParameter_, 0));
}
return true;
}
void MKLDNNAddtoLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
CHECK_EQ(layerSize_, getSize()) << "this layer size can not be changed";
reshapeInput(bs, ih, iw);
ic = inputLayers_[0]->getSize() / ih / iw;
CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
for (size_t i = 0; i < inputLayers_.size(); i++) {
CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
}
oc = ic;
oh = ih;
ow = iw;
reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow);
printSizeInfo();
}
void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
if (biases_) {
LOG(FATAL) << "not implemented yet";
}
resetFwdBuffers(inVals_, out);
in = inVals_[0];
std::shared_ptr<sum::primitive_desc> fwdPD;
resetFwdPD(fwdPD, inVals_, out);
resetFwdPipeline(pipeline, fwdPD, inVals_, out);
}
void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetBwdBuffers(inGrads_, out);
in = inGrads_[0];
// backward only needs to share the output grad with the input grads
for (size_t i = 0; i < inGrads_.size(); i++) {
if (inGrads_[i] != nullptr) {
inGrads_[i] = out;
inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
}
}
}
void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
if (biases_ && biases_->getWGrad()) {
biases_->getParameterPtr()->incUpdate(callback);
}
}
void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
resetInValue(inputs[i], nullptr, i);
CHECK(inputs[i]);
inputs[i]->downSpatial();
}
for (size_t i = 1; i < inputs.size(); i++) {
CHECK_PRIMITIVE_DESC_EQ(inputs[i], inputs[0]->getPrimitiveDesc());
}
resetOutValue(out, inputs[0]->getPrimitiveDesc());
}
void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out) {
std::vector<double> scales(inputs.size(), 1.0);
std::vector<memory::primitive_desc> srcPDs;
for (size_t i = 0; i < inputs.size(); i++) {
srcPDs.push_back(inputs[i]->getPrimitiveDesc());
}
CHECK(out);
pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
}
void MKLDNNAddtoLayer::resetFwdPipeline(
std::vector<primitive>& pipeline,
std::shared_ptr<sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
std::vector<primitive::at> srcs;
for (size_t i = 0; i < inputs.size(); i++) {
srcs.push_back(*(inputs[i]));
}
fwd_.reset(new sum(*pd, srcs, *out));
pipeline.push_back(*fwd_);
}
void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
CHECK(outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
CHECK(out);
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
}
}
} // namespace paddle

@@ -0,0 +1,110 @@
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "MKLDNNLayer.h"
#include "mkldnn.hpp"
namespace paddle {
/**
* @brief A subclass of MKLDNNLayer that implements the Addto layer.
*
* The config file api is mkldnn_addto
*/
class MKLDNNAddtoLayer : public MKLDNNLayer {
protected:
std::vector<MKLDNNMatrixPtr> inVals_;
std::vector<MKLDNNMatrixPtr> inGrads_;
// layer size == ic * ih * iw == oc * oh * ow, and cannot be changed
size_t layerSize_;
// TODO(TJ): this part has not been optimized by MKL-DNN
std::unique_ptr<Weight> biases_;
public:
explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
~MKLDNNAddtoLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void updateWeights(const UpdateCallback& callback) override;
void printValueFormat() override {
for (size_t i = 0; i < inVals_.size(); ++i) {
VLOG(MKLDNN_FMTS) << i << " input: " << inVals_[i]->getFormat() << " >>>";
}
if (outVal_) {
VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
}
if (extOutVal_) {
VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
}
}
void printGradFormat() override {
if (extOutGrad_) {
VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
}
if (outGrad_) {
VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
}
for (size_t i = 0; i < inGrads_.size(); ++i) {
VLOG(MKLDNN_FMTS) << i << " input: " << inGrads_[i]->getFormat() << "<<<";
}
}
protected:
/**
* Forward functions: reset buffers(inputs, output, bias),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(inputs, output, bias)
*/
void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
};
} // namespace paddle
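Functionally, addto is just an element-wise sum of equally sized inputs (the optional bias is not supported by this MKL-DNN path yet); resetFwdPD() above maps that onto an mkldnn::sum primitive with unit scales. A plain C++ reference of the same semantics (illustrative only, not the layer's actual code path):

#include <cassert>
#include <cstddef>
#include <vector>

// out[j] = sum over all inputs of in[i][j]; every input must have layerSize_
// elements, matching the CHECK_EQ in MKLDNNAddtoLayer::init().
std::vector<float> AddtoReference(const std::vector<std::vector<float>>& ins) {
  assert(!ins.empty());
  std::vector<float> out(ins[0].size(), 0.0f);
  for (const auto& in : ins) {
    assert(in.size() == out.size());
    for (size_t j = 0; j < in.size(); ++j) {
      out[j] += in[j];
    }
  }
  return out;
}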

@@ -77,7 +77,7 @@ void MKLDNNLayer::forward(PassType passType) {
    needResetBwd_ = true;
  }

-  if (inputLayers_[0]->getType() == "data") {
+  if (inputLayers_[0]->getType() == "data" && inputLayers_.size() == 1) {
    // Update input value data when input layer is "data" type,
    // since the input value data address might be changed.
    CHECK(extInVal_);
@@ -171,14 +171,16 @@ void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
}

void MKLDNNLayer::resetInValue(
-    MKLDNNMatrixPtr& in, const std::shared_ptr<memory::primitive_desc>& intPD) {
+    MKLDNNMatrixPtr& in,
+    const std::shared_ptr<memory::primitive_desc>& intPD,
+    size_t inputIdx) {
  cvtInVal_ = nullptr;
  extInVal_ = nullptr;
  in = nullptr;
  CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
  auto extPD = MKLDNNMatrix::createPrimitiveDesc(
      {bs_, ic_, ih_, iw_}, format::nchw, engine_);
-  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
+  const MatrixPtr& inMat = inputLayers_[inputIdx]->getOutputValue();
  in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
  CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
  if (in == nullptr || in->getFormat() == format::nc) {
@@ -216,11 +218,12 @@ void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
}

void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
-                              memory::primitive_desc intPD) {
+                              memory::primitive_desc intPD,
+                              size_t inputIdx) {
  cvtInGrad_ = nullptr;
  extInGrad_ = nullptr;
  in = nullptr;
-  LayerPtr& input = inputLayers_[0];
+  LayerPtr& input = inputLayers_[inputIdx];
  if (input->getOutputGrad() == nullptr) {
    // no need input grad
    return;
@@ -245,7 +248,6 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
    return;
  }
  // need create reorder
-  // TODO(TJ): add macro definition to simplify it
  CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
      << "should have external input value and the format must be nchw(nc)";
  extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);

@@ -199,7 +199,8 @@ protected:
   */
  void resetInValue(
      MKLDNNMatrixPtr& in,
-      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr);
+      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
+      size_t inputIdx = 0);

  /**
   * reset output value from internal primitive desc.
@@ -212,7 +213,9 @@ protected:
   * reset input grad from internal primitive desc.
   * reset both internal and external buffer and create reorder if necessary.
   */
-  void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD);
+  void resetInGrad(MKLDNNMatrixPtr& in,
+                   mkldnn::memory::primitive_desc intPD,
+                   size_t inputIdx = 0);

  /**
   * reset output grad from internal primitive desc.

@@ -132,7 +132,7 @@ void MKLDNNTester::checkForward() {
  VLOG(MKLDNN_TESTS) << "Check Forward";
  printTopDatas();
  double delta =
-      compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue());
+      compareMatrix(refLayer_->getOutputValue(), dnnLayer_->getOutputValue());
  EXPECT_LE(fabs(delta), eps_);
}

@@ -147,7 +147,7 @@ void MKLDNNTester::checkBackwardData() {
    VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
    printMatrix(refDiff);
-    double delta = compareMatrix(dnnDiff, refDiff);
+    double delta = compareMatrix(refDiff, dnnDiff);
    EXPECT_LE(fabs(delta), eps_);
    if (isBN) {
      // the other two inputs in batch norm are for moving mean and var
@@ -177,7 +177,7 @@ void MKLDNNTester::checkBackwardWgts() {
                       << parameters_[REF][i]->getName();
    printVector(ref);
-    double delta = compareVector(dnn, ref);
+    double delta = compareVector(ref, dnn);
    EXPECT_LE(fabs(delta), eps_);
  }

@@ -271,20 +271,53 @@ TEST(MKLDNNLayer, BatchNormLayer) {
  testBatchNormLayer({16, 32, 16, 16});
}

-struct testActDesc {
+struct testImageDesc {
  int bs, ic, ih, iw;
};

-static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) {
+static void getAddtoConfig(TestConfig& cfg,
+                           const testImageDesc& pm,
+                           const size_t nInputs = 1) {
  cfg.biasSize = 0;
  cfg.layerConfig.set_type("addto");
  size_t layerSize = pm.ic * pm.ih * pm.iw;
  cfg.layerConfig.set_size(layerSize);
-  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
-  cfg.layerConfig.add_inputs();
+  cfg.layerConfig.set_active_type("relu");
+  for (size_t i = 0; i < nInputs; ++i) {
+    std::stringstream ss;
+    ss << "layer_" << i;
+    cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0});
+    LayerInputConfig* input = cfg.layerConfig.add_inputs();
+    ImageConfig* img_conf = input->mutable_image_conf();
+    img_conf->set_channels(pm.ic);
+    img_conf->set_img_size_y(pm.ih);
+    img_conf->set_img_size(pm.iw);
+  }
}

+void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
+  CHECK_GE(nInputs, 1);
+  TestConfig dnnConfig;
+  getAddtoConfig(dnnConfig, pm, nInputs);
+  dnnConfig.layerConfig.set_type("mkldnn_addto");
+  // TODO(TJ): test with bias
+  for (auto withBias : {false}) {
+    if (withBias) {
+      dnnConfig.biasSize = pm.ic * pm.ih * pm.iw;
+    } else {
+      dnnConfig.biasSize = 0;
+    }
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
+  }
+}
+
+TEST(MKLDNNLayer, AddtoLayer) {
+  testAddtoLayer({16, 5, 14, 14}, 1);
+  testAddtoLayer({8, 10, 8, 8}, 2);
+  testAddtoLayer({4, 12, 1, 1}, 3);
+}
+
-void testActivation(std::string actType, const testActDesc& pm) {
+void testActivation(std::string actType, const testImageDesc& pm) {
  // TODO(TJ): remove me when paddle support elu activation
  if (actType == "mkldnn_elu") {
    return;

@@ -142,6 +142,7 @@ set(DEPS_OPS
    nccl_op
    sequence_conv_op
    sequence_pool_op
+    lod_rank_table_op
    lstm_op)

op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
@@ -150,6 +151,7 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(sum_op DEPS net_op selected_rows_functor)
op_library(pool_op DEPS pooling)
op_library(pool_with_index_op DEPS pooling)
+op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
if(WITH_GPU)
  op_library(nccl_op DEPS nccl_common)
endif()

@@ -33,7 +33,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
    auto inference_dim = ctx->GetInputDim("Out");
    auto label_dim = ctx->GetInputDim("Label");
-    // Assume indices has same shape with infernece, because
+    // Assume indices has same shape as inference, because
    // it's the output of topk.

    PADDLE_ENFORCE_EQ(label_dim.size(), 2, "label's rank must be 2.");
@@ -60,20 +60,24 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
                 framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // TODO(typhoonzero): support both inference value and indices.
-    AddInput("Out", "topk (inferences) the network output");
-    AddInput("Indices", "topk (indices) the network output");
+    AddInput("Out", "The network output of topk (inferences)");
+    AddInput("Indices", "The network output of topk (indices)");
    AddInput("Label", "Label of the training data");
    // TODO(typhoonzero): AddInput("Weight", ...
    AddOutput("Accuracy", "The accuracy of current batch");

    AddComment(R"DOC(
-Accuracy. It will print accuracy rate for classification.
-The accuracy is:
-.. math::
-accuracy = \\frac{NumOfCorrectPredicts}{NumOfAllSamples})
-
-Both the input `Out` and `Label` can carry the LoD (Level of Details)
-information, or not. But the output only shares the LoD with input `Inference`.
+Accuracy Operator.
+
+It will print accuracy rate for classification.
+The accuracy is calculated as follows:
+
+$$accuracy = \frac{NumOfCorrectPredicts}{NumOfAllSamples}$$
+
+Both the input Out and Label can carry the LoD (Level of Details)
+information, or not. But the output only shares the LoD information
+with the input Out(Inference).

)DOC");
  }
};

@@ -44,7 +44,7 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Sigmoid operator");
    AddOutput("Y", "Output of Sigmoid operator");
    AddComment(R"DOC(
-Sigmoid activation operator.
+Sigmoid Activation Operator.

$y = 1 / (1 + e^{-x})$

@@ -60,7 +60,7 @@ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of LogSigmoid operator");
    AddOutput("Y", "Output of LogSigmoid operator");
    AddComment(R"DOC(
-Logsigmoid activation operator.
+Logsigmoid Activation Operator.

$y = \log(1 / (1 + e^{-x}))$

@@ -75,7 +75,7 @@ class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Exp operator");
    AddOutput("Y", "Output of Exp operator");
    AddComment(R"DOC(
-Exp activation operator.
+Exp Activation Operator.

$y = e^x$

@@ -90,7 +90,7 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Relu operator");
    AddOutput("Y", "Output of Relu operator");
    AddComment(R"DOC(
-Relu activation operator.
+Relu Activation Operator.

$y = \max(x, 0)$

@@ -109,7 +109,7 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("alpha", "The small negative slope")
        .SetDefault(static_cast<AttrType>(0.02f));
    AddComment(R"DOC(
-LeakyRelu activation operator.
+LeakyRelu Activation Operator.

$y = \max(x, \alpha * x)$

@@ -128,7 +128,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("lambda", "non-negative offset")
        .SetDefault(static_cast<AttrType>(0.5f));
    AddComment(R"DOC(
-Softshrink activation operator.
+Softshrink Activation Operator.

$$
y = \begin{cases}
@@ -149,7 +149,7 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Tanh operator");
    AddOutput("Y", "Output of Tanh operator");
    AddComment(R"DOC(
-Tanh activation operator.
+Tanh Activation Operator.

$$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

@@ -165,7 +165,7 @@ class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of TanhShrink operator");
    AddOutput("Y", "Output of TanhShrink operator");
    AddComment(R"DOC(
-TanhShrink activation operator.
+TanhShrink Activation Operator.

$$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

@@ -184,7 +184,7 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("threshold", "The value of threshold for HardShrink")
        .SetDefault(static_cast<AttrType>(0.5));
    AddComment(R"DOC(
-HardShrink activation operator.
+HardShrink Activation Operator.

$$
y = \begin{cases}

@@ -205,7 +205,7 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Sqrt operator");
    AddOutput("Y", "Output of Sqrt operator");
    AddComment(R"DOC(
-Sqrt activation operator.
+Sqrt Activation Operator.

$y = \sqrt{x}$

@@ -220,7 +220,7 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Abs operator");
    AddOutput("Y", "Output of Abs operator");
    AddComment(R"DOC(
-Abs activation operator.
+Abs Activation Operator.

$y = |x|$

@@ -236,7 +236,7 @@ class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Reciprocal operator");
    AddOutput("Y", "Output of Reciprocal operator");
    AddComment(R"DOC(
-Reciprocal activation operator.
+Reciprocal Activation Operator.

$$y = \frac{1}{x}$$

@@ -251,7 +251,7 @@ class LogOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Log operator");
    AddOutput("Y", "Output of Log operator");
    AddComment(R"DOC(
-Log activation operator.
+Log Activation Operator.

$y = \ln(x)$

@@ -268,7 +268,7 @@ class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Square operator");
    AddOutput("Y", "Output of Square operator");
    AddComment(R"DOC(
-Square activation operator.
+Square Activation Operator.

$y = x^2$
@@ -284,7 +284,7 @@ class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Softplus operator");
    AddOutput("Y", "Output of Softplus operator");
    AddComment(R"DOC(
-Softplus activation operator.
+Softplus Activation Operator.

$y = \ln(1 + e^{x})$

@@ -300,7 +300,7 @@ class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Softsign operator");
    AddOutput("Y", "Output of Softsign operator");
    AddComment(R"DOC(
-Softsign activation operator.
+Softsign Activation Operator.

$$y = \frac{x}{1 + |x|}$$

@@ -320,7 +320,7 @@ class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("t_max", "The max marginal value of BRelu")
        .SetDefault(static_cast<AttrType>(24));
    AddComment(R"DOC(
-BRelu activation operator.
+BRelu Activation Operator.

$y = \max(\min(x, t_{min}), t_{max})$

@@ -339,7 +339,7 @@ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("threshold", "The threshold value of SoftRelu")
        .SetDefault(static_cast<AttrType>(40));
    AddComment(R"DOC(
-SoftRelu activation operator.
+SoftRelu Activation Operator.

$y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$

@@ -357,7 +357,7 @@ class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("alpha", "The alpha value of ELU")
        .SetDefault(static_cast<AttrType>(1.0f));
    AddComment(R"DOC(
-ELU activation operator.
+ELU Activation Operator.

Applies the following element-wise computation on the input according to
https://arxiv.org/abs/1511.07289.

@@ -378,7 +378,7 @@ class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("threshold", "The threshold value of Relu6")
        .SetDefault(static_cast<AttrType>(6));
    AddComment(R"DOC(
-Relu6 activation operator.
+Relu6 Activation Operator.

$y = \min(\max(0, x), 6)$

@@ -396,7 +396,7 @@ class PowOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("factor", "The exponential factor of Pow")
        .SetDefault(static_cast<AttrType>(1));
    AddComment(R"DOC(
-Pow activation operator.
+Pow Activation Operator.

$y = x^{factor}$
@@ -416,7 +416,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("scale_b", "The scale parameter of b for the input")
        .SetDefault(static_cast<AttrType>(1.7159));
    AddComment(R"DOC(
-STanh activation operator.
+STanh Activation Operator.

$$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$

@@ -435,7 +435,7 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("threshold", "The threshold location of activation")
        .SetDefault(static_cast<AttrType>(1.0));
    AddComment(R"DOC(
-ThresholdedRelu activation operator.
+ThresholdedRelu Activation Operator.

$$
y = \begin{cases}

@@ -461,7 +461,7 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("offset", "Offset for linear approximation of sigmoid")
        .SetDefault(static_cast<AttrType>(0.5));
    AddComment(R"DOC(
-HardSigmoid activation operator.
+HardSigmoid Activation Operator.

Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391),
which is much faster than sigmoid.

@@ -64,16 +64,15 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("Grad", "(Tensor) Input gradient");
-    AddInput("AvgSquaredGrad",
-             "(Tensor) Input expectation of squared gradient");
+    AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient");
    AddInput("AvgSquaredUpdate",
-             "(Tensor) Input expectation of squared parameter updates");
+             "(Tensor) Input average of squared parameter updates");
    AddOutput("ParamOut", "(Tensor) Output parameter");
    AddOutput("AvgSquaredGradOut",
-              "(Tensor) Output expectation of squared gradient");
+              "(Tensor) Output average of squared gradient");
    AddOutput("AvgSquaredUpdateOut",
-              "(Tensor) Output expectation of squared parameter updates");
+              "(Tensor) Output average of squared parameter updates");

    AddAttr<float>("rho",
                   "(float, default 0.95) Exponential decay rate "
@@ -84,22 +83,21 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
                   "numerical stability")
        .SetDefault(1.0e-6f);
    AddComment(R"DOC(
-Adadelta Updates Operator.
+Adadelta Optimizer.

-This implements the Adadelta optimizer[1]. Adadelta is a per-dimension
-adaptive learning rate method for gradient descent.
+Adadelta optimizer is implemented as explained in:
+https://arxiv.org/abs/1212.5701
+Adadelta is a per-dimension adaptive learning rate method used
+for gradient descent.

-Adadelta updates:
+Adadelta updates are as follows:

-avg_squared_grad_out = rho * avg_squared_grad + (1 - rho) * grad * grad
-param_update = - sqrt((avg_squared_update + epsilon) /
-               (avg_squared_grad_out + epsilon)) * grad
-avg_squared_update_out = rho * avg_squared_update + (1 - rho) * param_update**2
-param_out = param + param_update
-
-References:
-[1] ADADELTA: An Adaptive Learning Rate Method
-    https://arxiv.org/abs/1212.5701
+$$avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \break
+paramUpdate = - $\sqrt{((avgSquaredUpdate + \epsilon) /
+    (avgSquaredGradOut + \epsilon))}$ * grad \break
+avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) *
+    {(paramUpdate)}^2 \break
+paramOut = param + paramUpdate$$

)DOC");
  }
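Read as plain code, the update described in the rewritten DOC string is the following (a reference sketch over flat arrays, not Paddle's actual kernel; rho and epsilon default to the attribute defaults above):

#include <cmath>
#include <cstddef>
#include <vector>

// One Adadelta step: accumulate the squared gradient, compute the update from
// the ratio of running averages, accumulate the squared update, apply it.
void AdadeltaStep(std::vector<float>& param, const std::vector<float>& grad,
                  std::vector<float>& avg_sq_grad,
                  std::vector<float>& avg_sq_update, float rho = 0.95f,
                  float epsilon = 1.0e-6f) {
  for (size_t i = 0; i < param.size(); ++i) {
    avg_sq_grad[i] = rho * avg_sq_grad[i] + (1 - rho) * grad[i] * grad[i];
    const float update =
        -std::sqrt((avg_sq_update[i] + epsilon) / (avg_sq_grad[i] + epsilon)) *
        grad[i];
    avg_sq_update[i] = rho * avg_sq_update[i] + (1 - rho) * update * update;
    param[i] += update;
  }
}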

@@ -73,12 +73,16 @@ class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
Adaptive Gradient Algorithm (Adagrad).

-moment_out = moment + grad * grad
-param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
+The update is done as follows:
+
+$$momentOut = moment + grad * grad \break
+paramOut = param - learningRate * grad / ($\sqrt{momentOut}$ + \epsilon) \break
+$$

The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-does not have the epsilon attribute. It is added here for numerical stability
-by avoiding division by zero.
+does not have the epsilon attribute. It is added here in our implementation
+as also proposed here: http://cs231n.github.io/neural-networks-3/#ada
+for numerical stability to avoid the division by zero error.

)DOC");
  }

@@ -51,8 +51,8 @@ class AdamOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
                      "Beta1 power accumulator should have 1 dimension");
    auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
-    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
-                      "Beta1 power accumulator should have 1 dimension");
+    PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
+                      "Beta2 power accumulator should have 1 dimension");

    auto param_dims = ctx->GetInputDim("Param");
    PADDLE_ENFORCE_EQ(
@@ -60,10 +60,10 @@ class AdamOp : public framework::OperatorWithKernel {
        "Param and Grad input of AdamOp should have same dimension");
    PADDLE_ENFORCE_EQ(
        param_dims, ctx->GetInputDim("Moment1"),
-        "Param and Moment input of AdamOp should have same dimension");
+        "Param and Moment1 input of AdamOp should have same dimension");
    PADDLE_ENFORCE_EQ(
        param_dims, ctx->GetInputDim("Moment2"),
-        "Param and InfNorm input of AdamOp should have same dimension");
+        "Param and Moment2 input of AdamOp should have same dimension");

    ctx->SetOutputDim("ParamOut", param_dims);
    ctx->SetOutputDim("Moment1Out", param_dims);
@@ -103,23 +103,20 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault(1.0e-8f);
    AddComment(R"DOC(
-Adam Updates Operator.
+Adam Optimizer.

This implements the Adam optimizer from Section 2 of the Adam
-paper[1]. Adam is a first-order gradient-based optimization
-method based on adaptive estimates of lower-order moments.
+paper : https://arxiv.org/abs/1412.6980.
+Adam is a first-order gradient-based optimization method based on
+adaptive estimates of lower-order moments.

Adam updates:

-moment1_out = beta1 * moment1 + (1 - beta1) * grad
-moment2_out = beta2 * moment2 + (1 - beta2) * grad * grad
-learning_rate_t = learning_rate_t *
-                  sqrt(1 - beta2_pow) / (1 - beta1_pow)
-param_out = param - learning_rate_t * moment1/ (sqrt(moment2) + epsilon)
-
-References:
-[1] Adam: A Method for Stochastic Optimization
-    (https://arxiv.org/abs/1412.6980)
+$$moment_1_{out} = \beta_1 * moment_1 + (1 - \beta_1) * grad \break
+moment_2_{out} = \beta_2 * moment_2 + (1 - \beta_2) * grad * grad \break
+learningRate = learningRate *
+               $\sqrt{(1 - \beta_2_{pow})}$ / (1 - \beta_1_{pow}) \break
+paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$

)DOC");
  }
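The same kind of reference sketch for the Adam update above, including the Beta1Pow/Beta2Pow bias-correction term (illustrative only; the *_pow accumulators are assumed to be advanced by the caller after every step, as the operator's accumulator outputs are):

#include <cmath>
#include <cstddef>
#include <vector>

// One Adam step over flat arrays, mirroring the equations in the DOC string.
void AdamStep(std::vector<float>& param, const std::vector<float>& grad,
              std::vector<float>& moment1, std::vector<float>& moment2,
              float beta1_pow, float beta2_pow, float learning_rate,
              float beta1 = 0.9f, float beta2 = 0.999f,
              float epsilon = 1.0e-8f) {
  const float lr_t = learning_rate * std::sqrt(1 - beta2_pow) / (1 - beta1_pow);
  for (size_t i = 0; i < param.size(); ++i) {
    moment1[i] = beta1 * moment1[i] + (1 - beta1) * grad[i];
    moment2[i] = beta2 * moment2[i] + (1 - beta2) * grad[i] * grad[i];
    param[i] -= lr_t * moment1[i] / (std::sqrt(moment2[i]) + epsilon);
  }
}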

@@ -99,26 +99,22 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
                   "Constant for numerical stability")
        .SetDefault(1.0e-8f);
    AddComment(R"DOC(
-Adamax Updates Operator.
+Adamax Optimizer.

-This implements the Adamax optimizer from Section 7 of the Adam
-paper[1]. Adamax is a variant of the
+We implement the Adamax optimizer from Section 7 of the Adam
+paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
Adam algorithm based on the infinity norm.

Adamax updates:

-moment_out = beta1 * moment + (1 - beta1) * grad
-inf_norm_out = max(beta2 * inf_norm + epsilon, abs(grad))
-learning_rate_t = learning_rate/(1 - beta1_pow)
-param_out = param - learning_rate_t * moment_out/inf_norm_out
+$$momentOut = \beta_1 * moment + (1 - \beta_1) * grad \break
+infNormOut = max(\beta_2 * infNorm + \epsilon, |grad|) \break
+learningRate = learningRate /(1 - \beta_1_{pow}) \break
+paramOut = param - learningRate * momentOut / infNormOut$$

The original paper does not have an epsilon attribute.
-However, it is added here for numerical stability
-by preventing divide by 0.
+However, it is added here for numerical stability to prevent the
+division by 0 error.

-References:
-[1] Adam: A Method for Stochastic Optimization
-    (https://arxiv.org/abs/1412.6980)
-
)DOC");
  }

Some files were not shown because too many files have changed in this diff.