Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into add-GRUOp-dev

mobile_baidu
guosheng 7 years ago
commit 7a1d5e9d6a

@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/scope.h"
@ -73,6 +74,8 @@ static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
var->GetMutable<std::vector<framework::Scope>>();
} else if (var_type == VarDesc::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == VarDesc::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else {
PADDLE_THROW(
"Variable type %d is not in "

@ -109,6 +109,11 @@ message LoDTensorDesc {
optional int32 lod_level = 2 [ default = 0 ];
}
message LoDTensorArrayDesc {
required TensorDesc tensor = 1;
optional int32 lod_level = 2 [ default = 0 ];
}
message VarDesc {
enum VarType {
LOD_TENSOR = 1;
@ -117,11 +122,13 @@ message VarDesc {
FETCH_LIST = 4;
STEP_SCOPES = 5;
LOD_RANK_TABLE = 6;
LOD_TENSOR_ARRAY = 7;
}
required string name = 1;
required VarType type = 2;
optional LoDTensorDesc lod_tensor = 3;
optional TensorDesc selected_rows = 4;
optional LoDTensorArrayDesc tensor_array = 6;
optional bool persistable = 5 [ default = false ];
}

@ -33,10 +33,15 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
item.length = vec[i + 1] - vec[i];
items_.emplace_back(item);
}
std::sort(items_.begin(), items_.end(),
[](const TableItem& a, const TableItem& b) {
return a.length > b.length;
});
// NOTE(yuyang18):
//
// The time complexity of stable_sort is O(N*log(N)) if additional memory is
// available. It is easy to debug and unit test when using `stable_sort`
// instead of `sort`. Also, the items of a rank table will not be too large.
std::stable_sort(items_.begin(), items_.end(),
[](const TableItem& a, const TableItem& b) {
return a.length > b.length;
});
}
} // namespace framework
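For context, a minimal standalone sketch of the sorting step the note above describes, using a simplified stand-in for the rank-table item (field names assumed for illustration, not the framework's actual types):

#include <algorithm>
#include <cstddef>
#include <vector>

// Simplified stand-in for a rank-table entry: original index plus length.
struct RankItem {
  size_t index;
  size_t length;
};

// Sort items by descending length. stable_sort keeps the original order of
// items with equal lengths, which makes the result deterministic and easy
// to unit test, as the note above explains.
void SortByLengthDesc(std::vector<RankItem>* items) {
  std::stable_sort(items->begin(), items->end(),
                   [](const RankItem& a, const RankItem& b) {
                     return a.length > b.length;
                   });
}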

@ -135,5 +135,43 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
PADDLE_ENFORCE_LT(begin, end, "Cannot shrink, the result tensor is empty.");
ShareDataWith(Slice(begin, end));
}
void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
std::vector<std::vector<size_t>>* lod_length,
size_t* start_offset) {
lod_length->clear();
PADDLE_ENFORCE(start_idx < lod.size() - 1,
"start_idx should be >= 0 and < lod.size() - 1.");
PADDLE_ENFORCE(end_idx < lod.size(),
"end_idx should be >= 0 and < lod.size().");
PADDLE_ENFORCE_LE(start_idx, end_idx,
"start_idx should be less than end_idx.");
for (size_t level_idx = 0; level_idx < lod.size(); ++level_idx) {
std::vector<size_t> level_lens;
for (size_t i = start_idx; i < end_idx; ++i) {
level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
}
lod_length->emplace_back(level_lens);
start_idx = lod[level_idx][start_idx];
end_idx = lod[level_idx][end_idx];
}
*start_offset = start_idx;
}
void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length) {
PADDLE_ENFORCE_EQ(
lod->size(), lod_length.size(),
"The lod_length should has the same size with the appended lod.");
for (size_t i = 0; i < lod->size(); ++i) {
auto& level = (*lod)[i];
if (level.empty()) {
level.push_back(0);
}
for (size_t len : lod_length[i]) {
level.push_back(level.back() + len);
}
}
}
} // namespace framework
} // namespace paddle

@ -181,5 +181,11 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
return tensor;
}
void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
std::vector<std::vector<size_t>>* lod_length,
size_t* start_offset);
void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length);
} // namespace framework
} // namespace paddle

@ -0,0 +1,23 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/framework/lod_tensor.h"
namespace paddle {
namespace framework {
using LoDTensorArray = std::vector<LoDTensor>;
}
} // namespace paddle

@ -144,5 +144,47 @@ TEST(LodExpand, test) {
}
}
TEST(LoD, GetFineGrainedLoDLength) {
LoD lod;
lod.push_back(std::vector<size_t>{0, 2, 4, 5});
lod.push_back(std::vector<size_t>{0, 1, 6, 8, 10, 11});
lod.push_back(
std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29});
std::vector<std::vector<size_t>> lod_length;
size_t start_offset;
paddle::framework::GetFineGrainedLoDLength(lod, 1, 2, &lod_length,
&start_offset);
std::vector<std::vector<size_t>> expected;
expected.push_back(std::vector<size_t>{2});
expected.push_back(std::vector<size_t>{2, 2});
expected.push_back(std::vector<size_t>{2, 3, 4, 2});
EXPECT_EQ(lod_length, expected);
EXPECT_EQ(start_offset, 15UL);
}
TEST(LoD, AppendLoD) {
std::vector<std::vector<size_t>> lod_lens;
lod_lens.push_back(std::vector<size_t>{2});
lod_lens.push_back(std::vector<size_t>{2, 2});
lod_lens.push_back(std::vector<size_t>{2, 3, 4, 2});
LoD origin;
origin.push_back(std::vector<size_t>{0, 2});
origin.push_back(std::vector<size_t>{0, 1, 6});
origin.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15});
paddle::framework::AppendLoD(&origin, lod_lens);
LoD expected;
expected.push_back(std::vector<size_t>{0, 2, 4});
expected.push_back(std::vector<size_t>{0, 1, 6, 8, 10});
expected.push_back(
std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26});
EXPECT_EQ(origin, expected);
}
} // namespace framework
} // namespace paddle

@ -37,13 +37,27 @@ std::vector<int64_t> VarDescBind::Shape() const {
DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); }
void VarDescBind::SetLoDLevel(int32_t lod_level) {
PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR);
desc_.mutable_lod_tensor()->set_lod_level(lod_level);
switch (desc_.type()) {
case VarDesc::LOD_TENSOR:
desc_.mutable_lod_tensor()->set_lod_level(lod_level);
break;
case VarDesc::LOD_TENSOR_ARRAY:
desc_.mutable_tensor_array()->set_lod_level(lod_level);
break;
default:
PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
}
}
int32_t VarDescBind::GetLodLevel() const {
PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR);
return desc_.lod_tensor().lod_level();
switch (desc_.type()) {
case VarDesc::LOD_TENSOR:
return desc_.lod_tensor().lod_level();
case VarDesc::LOD_TENSOR_ARRAY:
return desc_.tensor_array().lod_level();
default:
PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
}
}
const TensorDesc &VarDescBind::tensor_desc() const {
@ -53,6 +67,8 @@ const TensorDesc &VarDescBind::tensor_desc() const {
return desc_.selected_rows();
case VarDesc::LOD_TENSOR:
return desc_.lod_tensor().tensor();
case VarDesc::LOD_TENSOR_ARRAY:
return desc_.tensor_array().tensor();
default:
PADDLE_THROW("Unexpected branch.");
}
@ -66,6 +82,8 @@ TensorDesc *VarDescBind::mutable_tensor_desc() {
return desc_.mutable_selected_rows();
case VarDesc::LOD_TENSOR:
return desc_.mutable_lod_tensor()->mutable_tensor();
case VarDesc::LOD_TENSOR_ARRAY:
return desc_.mutable_tensor_array()->mutable_tensor();
default:
PADDLE_THROW("Unexpected branch.");
}

@ -0,0 +1,154 @@
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLDNNAddtoLayer.h"
using namespace mkldnn; // NOLINT
namespace paddle {
REGISTER_LAYER(mkldnn_addto, MKLDNNAddtoLayer);
bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
if (!MKLDNNLayer::init(layerMap, parameterMap)) {
return false;
}
layerSize_ = getSize();
for (size_t i = 0; i < inputLayers_.size(); i++) {
CHECK_EQ(layerSize_, inputLayers_[i]->getSize()) << "input size must equal the layer size";
}
if (biasParameter_.get() != NULL) {
biases_ =
std::unique_ptr<Weight>(new Weight(1, layerSize_, biasParameter_, 0));
}
return true;
}
void MKLDNNAddtoLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
CHECK_EQ(layerSize_, getSize()) << "this layer size can not be changed";
reshapeInput(bs, ih, iw);
ic = inputLayers_[0]->getSize() / ih / iw;
CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
for (size_t i = 0; i < inputLayers_.size(); i++) {
CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
}
oc = ic;
oh = ih;
ow = iw;
reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow);
printSizeInfo();
}
void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
if (biases_) {
LOG(FATAL) << "not implemented yet";
}
resetFwdBuffers(inVals_, out);
in = inVals_[0];
std::shared_ptr<sum::primitive_desc> fwdPD;
resetFwdPD(fwdPD, inVals_, out);
resetFwdPipeline(pipeline, fwdPD, inVals_, out);
}
void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetBwdBuffers(inGrads_, out);
in = inGrads_[0];
// The backward pass only needs to share the output grad with each input grad.
for (size_t i = 0; i < inGrads_.size(); i++) {
if (inGrads_[i] != nullptr) {
inGrads_[i] = out;
inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
}
}
}
void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
if (biases_ && biases_->getWGrad()) {
biases_->getParameterPtr()->incUpdate(callback);
}
}
void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
resetInValue(inputs[i], nullptr, i);
CHECK(inputs[i]);
inputs[i]->downSpatial();
}
for (size_t i = 1; i < inputs.size(); i++) {
CHECK_PRIMITIVE_DESC_EQ(inputs[i], inputs[0]->getPrimitiveDesc());
}
resetOutValue(out, inputs[0]->getPrimitiveDesc());
}
void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out) {
std::vector<double> scales(inputs.size(), 1.0);
std::vector<memory::primitive_desc> srcPDs;
for (size_t i = 0; i < inputs.size(); i++) {
srcPDs.push_back(inputs[i]->getPrimitiveDesc());
}
CHECK(out);
pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
}
void MKLDNNAddtoLayer::resetFwdPipeline(
std::vector<primitive>& pipeline,
std::shared_ptr<sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
std::vector<primitive::at> srcs;
for (size_t i = 0; i < inputs.size(); i++) {
srcs.push_back(*(inputs[i]));
}
fwd_.reset(new sum(*pd, srcs, *out));
pipeline.push_back(*fwd_);
}
void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
CHECK(outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
CHECK(out);
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
}
}
} // namespace paddle

@ -0,0 +1,110 @@
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "MKLDNNLayer.h"
#include "mkldnn.hpp"
namespace paddle {
/**
* @brief A subclass of MKLDNNLayer that implements the addto layer.
*
* The config file api is mkldnn_addto.
*/
class MKLDNNAddtoLayer : public MKLDNNLayer {
protected:
std::vector<MKLDNNMatrixPtr> inVals_;
std::vector<MKLDNNMatrixPtr> inGrads_;
// layer size == ic * ih * iw == oc * oh * ow, and cannot be changed
size_t layerSize_;
// TODO(TJ): this part has not been optimized by MKL-DNN
std::unique_ptr<Weight> biases_;
public:
explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
~MKLDNNAddtoLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void updateWeights(const UpdateCallback& callback) override;
void printValueFormat() override {
for (size_t i = 0; i < inVals_.size(); ++i) {
VLOG(MKLDNN_FMTS) << i << " input: " << inVals_[i]->getFormat() << " >>>";
}
if (outVal_) {
VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
}
if (extOutVal_) {
VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
}
}
void printGradFormat() override {
if (extOutGrad_) {
VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
}
if (outGrad_) {
VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
}
for (size_t i = 0; i < inGrads_.size(); ++i) {
VLOG(MKLDNN_FMTS) << i << " input: " << inGrads_[i]->getFormat() << "<<<";
}
}
protected:
/**
* Forward functions: reset buffers(inputs, output, bias),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(inputs, output, bias)
*/
void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
};
} // namespace paddle

@ -77,7 +77,7 @@ void MKLDNNLayer::forward(PassType passType) {
needResetBwd_ = true;
}
if (inputLayers_[0]->getType() == "data") {
if (inputLayers_[0]->getType() == "data" && inputLayers_.size() == 1) {
// Update input value data when input layer is "data" type,
// since the input value data address might be changed.
CHECK(extInVal_);
@ -171,14 +171,16 @@ void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
}
void MKLDNNLayer::resetInValue(
MKLDNNMatrixPtr& in, const std::shared_ptr<memory::primitive_desc>& intPD) {
MKLDNNMatrixPtr& in,
const std::shared_ptr<memory::primitive_desc>& intPD,
size_t inputIdx) {
cvtInVal_ = nullptr;
extInVal_ = nullptr;
in = nullptr;
CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
auto extPD = MKLDNNMatrix::createPrimitiveDesc(
{bs_, ic_, ih_, iw_}, format::nchw, engine_);
const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
const MatrixPtr& inMat = inputLayers_[inputIdx]->getOutputValue();
in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
if (in == nullptr || in->getFormat() == format::nc) {
@ -216,11 +218,12 @@ void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
}
void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
memory::primitive_desc intPD) {
memory::primitive_desc intPD,
size_t inputIdx) {
cvtInGrad_ = nullptr;
extInGrad_ = nullptr;
in = nullptr;
LayerPtr& input = inputLayers_[0];
LayerPtr& input = inputLayers_[inputIdx];
if (input->getOutputGrad() == nullptr) {
// no need input grad
return;
@ -245,7 +248,6 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
return;
}
// need create reorder
// TODO(TJ): add macro definition to simplify it
CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
<< "should have external input value and the format must be nchw(nc)";
extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);

@ -199,7 +199,8 @@ protected:
*/
void resetInValue(
MKLDNNMatrixPtr& in,
const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr);
const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
size_t inputIdx = 0);
/**
* reset output value from internal primitive desc.
@ -212,7 +213,9 @@ protected:
* reset input grad from internal primitive desc.
* reset both internal and external buffer and create reorder if necessary.
*/
void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD);
void resetInGrad(MKLDNNMatrixPtr& in,
mkldnn::memory::primitive_desc intPD,
size_t inputIdx = 0);
/**
* reset output grad from internal primitive desc.

@ -132,7 +132,7 @@ void MKLDNNTester::checkForward() {
VLOG(MKLDNN_TESTS) << "Check Forward";
printTopDatas();
double delta =
compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue());
compareMatrix(refLayer_->getOutputValue(), dnnLayer_->getOutputValue());
EXPECT_LE(fabs(delta), eps_);
}
@ -147,7 +147,7 @@ void MKLDNNTester::checkBackwardData() {
VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
printMatrix(refDiff);
double delta = compareMatrix(dnnDiff, refDiff);
double delta = compareMatrix(refDiff, dnnDiff);
EXPECT_LE(fabs(delta), eps_);
if (isBN) {
// the other two inputs in batch norm are for moving mean and var
@ -177,7 +177,7 @@ void MKLDNNTester::checkBackwardWgts() {
<< parameters_[REF][i]->getName();
printVector(ref);
double delta = compareVector(dnn, ref);
double delta = compareVector(ref, dnn);
EXPECT_LE(fabs(delta), eps_);
}

@ -271,20 +271,53 @@ TEST(MKLDNNLayer, BatchNormLayer) {
testBatchNormLayer({16, 32, 16, 16});
}
struct testActDesc {
struct testImageDesc {
int bs, ic, ih, iw;
};
static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) {
static void getAddtoConfig(TestConfig& cfg,
const testImageDesc& pm,
const size_t nInputs = 1) {
cfg.biasSize = 0;
cfg.layerConfig.set_type("addto");
size_t layerSize = pm.ic * pm.ih * pm.iw;
cfg.layerConfig.set_size(layerSize);
cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
cfg.layerConfig.add_inputs();
cfg.layerConfig.set_active_type("relu");
for (size_t i = 0; i < nInputs; ++i) {
std::stringstream ss;
ss << "layer_" << i;
cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0});
LayerInputConfig* input = cfg.layerConfig.add_inputs();
ImageConfig* img_conf = input->mutable_image_conf();
img_conf->set_channels(pm.ic);
img_conf->set_img_size_y(pm.ih);
img_conf->set_img_size(pm.iw);
}
}
void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
CHECK_GE(nInputs, 1);
TestConfig dnnConfig;
getAddtoConfig(dnnConfig, pm, nInputs);
dnnConfig.layerConfig.set_type("mkldnn_addto");
// TODO(TJ): test with bias
for (auto withBias : {false}) {
if (withBias) {
dnnConfig.biasSize = pm.ic * pm.ih * pm.iw;
} else {
dnnConfig.biasSize = 0;
}
RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
}
}
TEST(MKLDNNLayer, AddtoLayer) {
testAddtoLayer({16, 5, 14, 14}, 1);
testAddtoLayer({8, 10, 8, 8}, 2);
testAddtoLayer({4, 12, 1, 1}, 3);
}
void testActivation(std::string actType, const testActDesc& pm) {
void testActivation(std::string actType, const testImageDesc& pm) {
// TODO(TJ): remove me when paddle supports elu activation
if (actType == "mkldnn_elu") {
return;

@ -44,7 +44,7 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of Sigmoid operator");
AddOutput("Y", "Output of Sigmoid operator");
AddComment(R"DOC(
Sigmoid activation operator.
Sigmoid Activation Operator.
$y = 1 / (1 + e^{-x})$
@ -60,7 +60,7 @@ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of LogSigmoid operator");
AddOutput("Y", "Output of LogSigmoid operator");
AddComment(R"DOC(
Logsigmoid activation operator.
Logsigmoid Activation Operator.
$y = \log(1 / (1 + e^{-x}))$
@ -75,7 +75,7 @@ class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of Exp operator");
AddOutput("Y", "Output of Exp operator");
AddComment(R"DOC(
Exp activation operator.
Exp Activation Operator.
$y = e^x$
@ -90,7 +90,7 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of Relu operator");
AddOutput("Y", "Output of Relu operator");
AddComment(R"DOC(
Relu activation operator.
Relu Activation Operator.
$y = \max(x, 0)$
@ -109,7 +109,7 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<AttrType>("alpha", "The small negative slope")
.SetDefault(static_cast<AttrType>(0.02f));
AddComment(R"DOC(
LeakyRelu activation operator.
LeakyRelu Activation Operator.
$y = \max(x, \alpha * x)$
@ -128,7 +128,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<AttrType>("lambda", "non-negative offset")
.SetDefault(static_cast<AttrType>(0.5f));
AddComment(R"DOC(
Softshrink activation operator.
Softshrink Activation Operator.
$$
y = \begin{cases}
@ -149,7 +149,7 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of Tanh operator");
AddOutput("Y", "Output of Tanh operator");
AddComment(R"DOC(
Tanh activation operator.
Tanh Activation Operator.
$$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
@ -165,7 +165,7 @@ class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of TanhShrink operator");
AddOutput("Y", "Output of TanhShrink operator");
AddComment(R"DOC(
TanhShrink activation operator.
TanhShrink Activation Operator.
$$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
@ -184,7 +184,7 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<AttrType>("threshold", "The value of threshold for HardShrink")
.SetDefault(static_cast<AttrType>(0.5));
AddComment(R"DOC(
HardShrink activation operator.
HardShrink Activation Operator.
$$
y = \begin{cases}
@ -205,7 +205,7 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of Sqrt operator");
AddOutput("Y", "Output of Sqrt operator");
AddComment(R"DOC(
Sqrt activation operator.
Sqrt Activation Operator.
$y = \sqrt{x}$
@ -220,7 +220,7 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of Abs operator");
AddOutput("Y", "Output of Abs operator");
AddComment(R"DOC(
Abs activation operator.
Abs Activation Operator.
$y = |x|$
@ -236,7 +236,7 @@ class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of Reciprocal operator");
AddOutput("Y", "Output of Reciprocal operator");
AddComment(R"DOC(
Reciprocal activation operator.
Reciprocal Activation Operator.
$$y = \frac{1}{x}$$
@ -251,7 +251,7 @@ class LogOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of Log operator");
AddOutput("Y", "Output of Log operator");
AddComment(R"DOC(
Log activation operator.
Log Activation Operator.
$y = \ln(x)$
@ -268,7 +268,7 @@ class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of Square operator");
AddOutput("Y", "Output of Square operator");
AddComment(R"DOC(
Square activation operator.
Square Activation Operator.
$y = x^2$
@ -284,7 +284,7 @@ class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of Softplus operator");
AddOutput("Y", "Output of Softplus operator");
AddComment(R"DOC(
Softplus activation operator.
Softplus Activation Operator.
$y = \ln(1 + e^{x})$
@ -300,7 +300,7 @@ class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of Softsign operator");
AddOutput("Y", "Output of Softsign operator");
AddComment(R"DOC(
Softsign activation operator.
Softsign Activation Operator.
$$y = \frac{x}{1 + |x|}$$
@ -320,7 +320,7 @@ class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<AttrType>("t_max", "The max marginal value of BRelu")
.SetDefault(static_cast<AttrType>(24));
AddComment(R"DOC(
BRelu activation operator.
BRelu Activation Operator.
$y = \min(\max(x, t_{min}), t_{max})$
@ -339,7 +339,7 @@ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<AttrType>("threshold", "The threshold value of SoftRelu")
.SetDefault(static_cast<AttrType>(40));
AddComment(R"DOC(
SoftRelu activation operator.
SoftRelu Activation Operator.
$y = \ln(1 + \exp(\max(\min(x, threshold), -threshold)))$
@ -357,7 +357,7 @@ class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<AttrType>("alpha", "The alpha value of ELU")
.SetDefault(static_cast<AttrType>(1.0f));
AddComment(R"DOC(
ELU activation operator.
ELU Activation Operator.
Applies the following element-wise computation on the input according to
https://arxiv.org/abs/1511.07289.
@ -378,7 +378,7 @@ class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<AttrType>("threshold", "The threshold value of Relu6")
.SetDefault(static_cast<AttrType>(6));
AddComment(R"DOC(
Relu6 activation operator.
Relu6 Activation Operator.
$y = \min(\max(0, x), 6)$
@ -396,7 +396,7 @@ class PowOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<AttrType>("factor", "The exponential factor of Pow")
.SetDefault(static_cast<AttrType>(1));
AddComment(R"DOC(
Pow activation operator.
Pow Activation Operator.
$y = x^{factor}$
@ -416,7 +416,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<AttrType>("scale_b", "The scale parameter of b for the input")
.SetDefault(static_cast<AttrType>(1.7159));
AddComment(R"DOC(
STanh activation operator.
STanh Activation Operator.
$$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
@ -435,7 +435,7 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<AttrType>("threshold", "The threshold location of activation")
.SetDefault(static_cast<AttrType>(1.0));
AddComment(R"DOC(
ThresholdedRelu activation operator.
ThresholdedRelu Activation Operator.
$$
y = \begin{cases}
@ -461,7 +461,7 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<AttrType>("offset", "Offset for linear approximation of sigmoid")
.SetDefault(static_cast<AttrType>(0.5));
AddComment(R"DOC(
HardSigmoid activation operator.
HardSigmoid Activation Operator.
Segment-wise linear approximation of sigmoid (https://arxiv.org/abs/1603.00391),
which is much faster than sigmoid.
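To make the pattern of these activation docs concrete, here is a small standalone sketch that evaluates one of the formulas above (Sigmoid) elementwise; it is illustrative only and not the operators' actual CPU/GPU kernels:

#include <cmath>
#include <vector>

// Elementwise sigmoid, y = 1 / (1 + e^{-x}), matching the Sigmoid
// operator's documented formula.
std::vector<float> Sigmoid(const std::vector<float>& x) {
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = 1.0f / (1.0f + std::exp(-x[i]));
  }
  return y;
}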

@ -64,16 +64,15 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Param", "(Tensor) Input parameter");
AddInput("Grad", "(Tensor) Input gradient");
AddInput("AvgSquaredGrad",
"(Tensor) Input expectation of squared gradient");
AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient");
AddInput("AvgSquaredUpdate",
"(Tensor) Input expectation of squared parameter updates");
"(Tensor) Input average of squared parameter updates");
AddOutput("ParamOut", "(Tensor) Output parameter");
AddOutput("AvgSquaredGradOut",
"(Tensor) Output expectation of squared gradient");
"(Tensor) Output average of squared gradient");
AddOutput("AvgSquaredUpdateOut",
"(Tensor) Output expectation of squared parameter updates");
"(Tensor) Output average of squared parameter updates");
AddAttr<float>("rho",
"(float, default 0.95) Exponential decay rate "
@ -84,22 +83,21 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
"numerical stability")
.SetDefault(1.0e-6f);
AddComment(R"DOC(
Adadelta Updates Operator.
Adadelta Optimizer.
This implements the Adadelta optimizer[1]. Adadelta is a per-dimension
adaptive learning rate method for gradient descent.
Adadelta optimizer is implemented as explained in:
https://arxiv.org/abs/1212.5701
Adadelta is a per-dimension adaptive learning rate method used
for gradient descent.
Adadelta updates:
Adadelta updates are as follows:
avg_squared_grad_out = rho * avg_squared_grad + (1 - rho) * grad * grad
param_update = - sqrt((avg_squared_update + epsilon) /
(avg_squared_grad_out + epsilon)) * grad
avg_squared_update_out = rho * avg_squared_update + (1 - rho) * param_update**2
param_out = param + param_update
References:
[1] ADADELTA: An Adaptive Learning Rate Method
https://arxiv.org/abs/1212.5701
$$
avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \\
paramUpdate = -\sqrt{\frac{avgSquaredUpdate + \epsilon}{avgSquaredGradOut + \epsilon}} * grad \\
avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) * (paramUpdate)^2 \\
paramOut = param + paramUpdate
$$
)DOC");
}
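A minimal standalone sketch of the Adadelta update written out above, applied to a single scalar parameter (variable names are assumed for illustration; this is not the operator's kernel):

#include <cmath>

// One elementwise Adadelta step for a single parameter value, following the
// update equations documented above.
void AdadeltaStep(float grad, float rho, float epsilon,
                  float* param, float* avg_sq_grad, float* avg_sq_update) {
  *avg_sq_grad = rho * *avg_sq_grad + (1.0f - rho) * grad * grad;
  float update = -std::sqrt((*avg_sq_update + epsilon) /
                            (*avg_sq_grad + epsilon)) * grad;
  *avg_sq_update = rho * *avg_sq_update + (1.0f - rho) * update * update;
  *param += update;
}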

@ -73,12 +73,16 @@ class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
Adaptive Gradient Algorithm (Adagrad).
moment_out = moment + grad * grad
param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
The update is done as follows:
$$
momentOut = moment + grad * grad \\
paramOut = param - learningRate * grad / (\sqrt{momentOut} + \epsilon)
$$
The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
does not have the epsilon attribute. It is added here for numerical stability
by avoiding division by zero.
does not have the epsilon attribute. It is added here in our implementation
as also proposed here: http://cs231n.github.io/neural-networks-3/#ada
for numerical stability to avoid the division by zero error.
)DOC");
}
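Likewise, a scalar sketch of the Adagrad update documented above (illustrative only, not the operator's kernel):

#include <cmath>

// One elementwise Adagrad step for a single parameter value.
void AdagradStep(float grad, float learning_rate, float epsilon,
                 float* param, float* moment) {
  *moment += grad * grad;
  *param -= learning_rate * grad / (std::sqrt(*moment) + epsilon);
}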

@ -51,8 +51,8 @@ class AdamOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
"Beta1 power accumulator should have 1 dimension");
auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
"Beta1 power accumulator should have 1 dimension");
PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
"Beta2 power accumulator should have 1 dimension");
auto param_dims = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
@ -60,10 +60,10 @@ class AdamOp : public framework::OperatorWithKernel {
"Param and Grad input of AdamOp should have same dimension");
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment1"),
"Param and Moment input of AdamOp should have same dimension");
"Param and Moment1 input of AdamOp should have same dimension");
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment2"),
"Param and InfNorm input of AdamOp should have same dimension");
"Param and Moment2 input of AdamOp should have same dimension");
ctx->SetOutputDim("ParamOut", param_dims);
ctx->SetOutputDim("Moment1Out", param_dims);
@ -103,23 +103,20 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(1.0e-8f);
AddComment(R"DOC(
Adam Updates Operator.
Adam Optimizer.
This implements the Adam optimizer from Section 2 of the Adam
paper[1]. Adam is a first-order gradient-based optimization
method based on adaptive estimates of lower-order moments.
paper : https://arxiv.org/abs/1412.6980.
Adam is a first-order gradient-based optimization method based on
adaptive estimates of lower-order moments.
Adam updates:
moment1_out = beta1 * moment1 + (1 - beta1) * grad
moment2_out = beta2 * moment2 + (1 - beta2) * grad * grad
learning_rate_t = learning_rate_t *
sqrt(1 - beta2_pow) / (1 - beta1_pow)
param_out = param - learning_rate_t * moment1/ (sqrt(moment2) + epsilon)
References:
[1] Adam: A Method for Stochastic Optimization
(https://arxiv.org/abs/1412.6980)
$$
moment_{1,out} = \beta_1 * moment_1 + (1 - \beta_1) * grad \\
moment_{2,out} = \beta_2 * moment_2 + (1 - \beta_2) * grad * grad \\
learningRate = learningRate * \sqrt{1 - \beta_{2,pow}} / (1 - \beta_{1,pow}) \\
paramOut = param - learningRate * moment_1 / (\sqrt{moment_2} + \epsilon)
$$
)DOC");
}
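A scalar sketch of the Adam update documented above, with beta1_pow and beta2_pow (the Beta1Pow/Beta2Pow accumulators) tracked by the caller; illustrative only, not the operator's kernel:

#include <cmath>

// One elementwise Adam step for a single parameter value.
void AdamStep(float grad, float learning_rate, float beta1, float beta2,
              float epsilon, float beta1_pow, float beta2_pow,
              float* param, float* moment1, float* moment2) {
  *moment1 = beta1 * *moment1 + (1.0f - beta1) * grad;
  *moment2 = beta2 * *moment2 + (1.0f - beta2) * grad * grad;
  float lr_t =
      learning_rate * std::sqrt(1.0f - beta2_pow) / (1.0f - beta1_pow);
  *param -= lr_t * *moment1 / (std::sqrt(*moment2) + epsilon);
}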

@ -99,26 +99,22 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
"Constant for numerical stability")
.SetDefault(1.0e-8f);
AddComment(R"DOC(
Adamax Updates Operator.
Adamax Optimizer.
This implements the Adamax optimizer from Section 7 of the Adam
paper[1]. Adamax is a variant of the
We implement the Adamax optimizer from Section 7 of the Adam
paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
Adam algorithm based on the infinity norm.
Adamax updates:
moment_out = beta1 * moment + (1 - beta1) * grad
inf_norm_out = max(beta2 * inf_norm + epsilon, abs(grad))
learning_rate_t = learning_rate/(1 - beta1_pow)
param_out = param - learning_rate_t * moment_out/inf_norm_out
$$
momentOut = \beta_1 * moment + (1 - \beta_1) * grad \\
infNormOut = \max(\beta_2 * infNorm + \epsilon, |grad|) \\
learningRate = learningRate / (1 - \beta_{1,pow}) \\
paramOut = param - learningRate * momentOut / infNormOut
$$
The original paper does not have an epsilon attribute.
However, it is added here for numerical stability
by preventing divide by 0.
References:
[1] Adam: A Method for Stochastic Optimization
(https://arxiv.org/abs/1412.6980)
However, it is added here for numerical stability to prevent the
division by 0 error.
)DOC");
}
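A scalar sketch of the Adamax update documented above, with beta1_pow tracked by the caller (illustrative only, not the operator's kernel):

#include <algorithm>
#include <cmath>

// One elementwise Adamax step for a single parameter value.
void AdamaxStep(float grad, float learning_rate, float beta1, float beta2,
                float epsilon, float beta1_pow,
                float* param, float* moment, float* inf_norm) {
  *moment = beta1 * *moment + (1.0f - beta1) * grad;
  *inf_norm = std::max(beta2 * *inf_norm + epsilon, std::fabs(grad));
  float lr_t = learning_rate / (1.0f - beta1_pow);
  *param -= lr_t * *moment / *inf_norm;
}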

@ -23,11 +23,11 @@ class AucOp : public framework::OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out must be initialized.");
PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Indices"),
"Input of Indices must be initialized.");
"Input of Indices should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Label"),
"Input of Label must be initialized.");
"Input of Label should not be null.");
auto inference_height = ctx->GetInputDim("Out")[0];
auto label_height = ctx->GetInputDim("Label")[0];
@ -52,20 +52,20 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Out",
"A floating point 2D tensor, values are in the range [0, 1]."
"Each row is descend sorted. This input should be the"
"Each row is sorted in descending order. This input should be the"
"output of topk."
"Typically, this tensor indicates the probability of each label");
AddInput("Indices",
"An int 2D tensor, indicating the indices of original"
"tensor before sort. Typically, this tensor indicates which label"
"the probability stands for.");
"tensor before sorting. Typically, this tensor indicates which "
"label the probability stands for.");
AddInput("Label",
"A 2D int tensor indicating the label of the training data."
"The height is batch size and width is always 1.");
// TODO(typhoonzero): support weight input
AddOutput("AUC",
"A scalar representing the "
"current area-under-curve.");
"current area-under-the-curve.");
AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
.SetDefault("ROC");
@ -74,19 +74,18 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
" roc curve.")
.SetDefault(200);
AddComment(
R"DOC(Computes the AUC according forward output and label.
Best to use for binary classification evaluations.
AddComment(R"DOC(
Area Under The Curve (AUC) Operator.
This implementation computes the AUC according to forward output and label.
It is used very widely in binary classification evaluation. As a note:
If input label contains values other than 0 and 1, it will be cast
to bool.
You can find the definations here:
to bool. You can find the relevant definitions here:
https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
Possible curves are:
- ROC: Receiver operating characteristic
- PR: Precision Recall
There are two types of possible curves:
1. ROC: Receiver operating characteristic
2. PR: Precision Recall
)DOC");
}
};
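A rough standalone sketch of the ROC-AUC computation this operator documents: sweep evenly spaced thresholds over the positive-class probabilities and integrate TPR over FPR with the trapezoid rule. Illustrative only; the actual kernel may differ in details such as tie handling.

#include <cstddef>
#include <vector>

// pos_prob holds P(label == 1) per sample; label holds the ground truth.
float RocAuc(const std::vector<float>& pos_prob,
             const std::vector<bool>& label, int num_thresholds = 200) {
  std::vector<float> tpr(num_thresholds), fpr(num_thresholds);
  for (int t = 0; t < num_thresholds; ++t) {
    float threshold = static_cast<float>(t) / (num_thresholds - 1);
    size_t tp = 0, fp = 0, tn = 0, fn = 0;
    for (size_t i = 0; i < pos_prob.size(); ++i) {
      bool predicted_pos = pos_prob[i] >= threshold;
      if (label[i] && predicted_pos) {
        ++tp;
      } else if (label[i]) {
        ++fn;
      } else if (predicted_pos) {
        ++fp;
      } else {
        ++tn;
      }
    }
    tpr[t] = (tp + fn) > 0 ? static_cast<float>(tp) / (tp + fn) : 0.0f;
    fpr[t] = (fp + tn) > 0 ? static_cast<float>(fp) / (fp + tn) : 0.0f;
  }
  float auc = 0.0f;
  for (int t = 1; t < num_thresholds; ++t) {
    // fpr decreases as the threshold grows, so fpr[t - 1] >= fpr[t].
    auc += 0.5f * (fpr[t - 1] - fpr[t]) * (tpr[t - 1] + tpr[t]);
  }
  return auc;
}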

@ -70,7 +70,7 @@ class BatchNormOp : public framework::OperatorWithKernel {
: x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
"Input x must have 3 to 5 dimensions.");
"Input X must have 3 to 5 dimensions.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
@ -97,16 +97,16 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "The input tensor");
AddInput("Scale",
"Scale is a 1-dimensional tensor of size C "
"to be applied to the output");
"that is applied to the output");
AddInput("Bias",
"Bias is a 1-dimensional tensor of size C "
"to be applied to the output");
"that is applied to the output");
AddInput("Mean",
"The global mean (for training) or the "
"The global mean (for training) or "
"estimated mean (for testing)");
AddInput("Variance",
"The global variance (for training) "
"or the estimated Variance (for testing)");
"or estimated Variance (for testing)");
AddOutput("Y", "result after normalization");
AddOutput("MeanOut",
"Share memory with Mean. "
@ -123,10 +123,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
"will apply to output when training")
.AsIntermediate();
AddComment(R"DOC(
https://arxiv.org/pdf/1502.03167.pdf
Batch Normalization.
NHWC `[batch, in_height, in_width, in_channels]`
NCHW `[batch, in_channels, in_height, in_width]`
Batch Norm has been implemented as discussed in the paper:
https://arxiv.org/pdf/1502.03167.pdf
Can be used as a normalizer function for conv2d and fully_connected operations.
The required data format for this layer is one of the following:
1. NHWC `[batch, in_height, in_width, in_channels]`
2. NCHW `[batch, in_channels, in_height, in_width]`
)DOC");
}
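A standalone sketch of the per-channel normalization described above for the NCHW layout (function and variable names are assumed for illustration; this is not the operator's kernel):

#include <cmath>
#include <vector>

// y = scale[c] * (x - mean[c]) / sqrt(var[c] + eps) + bias[c], applied per
// channel of an NCHW tensor.
void BatchNormNCHW(const std::vector<float>& x, int N, int C, int H, int W,
                   const std::vector<float>& mean,
                   const std::vector<float>& var,
                   const std::vector<float>& scale,
                   const std::vector<float>& bias,
                   float epsilon, std::vector<float>* y) {
  y->resize(x.size());
  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      float inv_std = 1.0f / std::sqrt(var[c] + epsilon);
      for (int i = 0; i < H * W; ++i) {
        size_t idx = (static_cast<size_t>(n) * C + c) * H * W + i;
        (*y)[idx] = scale[c] * (x[idx] - mean[c]) * inv_std + bias[c];
      }
    }
  }
}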

@ -23,13 +23,17 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
CastOpProtoMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "the input tensor of cast op");
AddOutput("Out", "the output tensor of cast op");
AddComment(R"DOC(Cast operator.
cast the input tensor to other data type.
)DOC");
AddInput("X", "The input tensor of cast op");
AddOutput("Out", "The output tensor of cast op");
AddAttr<int>("out_data_type", "output data type");
AddAttr<int>("in_data_type", "input data type");
AddComment(R"DOC(
Cast Operator.
This operator casts the input tensor to another data type and
returns the output tensor.
)DOC");
}
};

@ -49,8 +49,11 @@ class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<AttrType>(
"max", "(float)Maximum value, above which element is replaced by max");
AddComment(R"DOC(
Clip operator limits the given input within an interval. The interval is
Clip Operator.
The clip operator limits the value of the given input within an interval. The interval is
specified with arguments 'min' and 'max'.
)DOC");
}
};
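A one-function sketch of the clip semantics described above (illustrative only):

#include <algorithm>
#include <vector>

// Elementwise clip of x to the interval [min_val, max_val].
std::vector<float> Clip(const std::vector<float>& x, float min_val,
                        float max_val) {
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = std::min(std::max(x[i], min_val), max_val);
  }
  return y;
}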

@ -23,21 +23,21 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Emission",
"(LoDTensor, default: LoDTensor<float>). "
"A 2-D LoDTensor with shape [N x D] where N is the size of the "
"(LoDTensor, default LoDTensor<float>) "
"A 2-D LoDTensor with shape [N x D], where N is the size of the "
"mini-batch and D is the total tag number. The unscaled emission "
"weight matrix for the linear chain CRF. ");
AddInput("Transition",
"(Tensor, default: Tensor<float>). A 2-D Tensor with shape "
"(Tensor, default Tensor<float>) A 2-D Tensor with shape "
"[(D + 2) x D]. The learnable parameter for the linear_chain_crf "
"operator. See more details in the operator's comments.");
AddInput("Label",
"(LoDTensor, default: LoDTensor<int>). A LoDTensor with shape "
"(LoDTensor, default LoDTensor<int>) A LoDTensor with shape "
"[N x 1], where N is the total element number in a mini-batch. "
"The ground truth.");
AddOutput(
"Alpha",
"(Tensor, default: Tensor<float>). A 2-D Tensor with shape [N x D]. "
"(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
"The forward vectors for the entire batch. Denote it as \f$\alpha\f$. "
"\f$\alpha$\f is a memo table used to calculate the normalization "
"factor in CRF. \f$\alpha[k, v]$\f stores the unnormalized "
@ -49,26 +49,28 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
.AsIntermediate();
AddOutput(
"EmissionExps",
"(Tensor, default: Tensor<float>). A 2-D Tensor with shape [N x D]. "
"(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
"The exponentials of Input(Emission). This is an intermediate "
"computational result in forward computation, and will be reused in "
"backward computation.")
.AsIntermediate();
AddOutput(
"TransitionExps",
"(Tensor, default: Tensor<float>). A 2-D Tensor with shape "
"(Tensor, default Tensor<float>) A 2-D Tensor with shape "
"[(D + 2) x D]. The exponentials of Input(Transition). This is an "
"intermediate computational result in forward computation, and "
"will be reused in backward computation.")
.AsIntermediate();
AddOutput(
"LogLikelihood",
"(Tensor, default: Tensor<float>). The logarithm of the conditional "
"(Tensor, default Tensor<float>) The logarithm of the conditional "
"likelihood of each training sample in a mini-batch. This is a 2-D "
"tensor with shape [S x 1], where S is the sequence number in a "
"mini-batch. Note: S is equal to the sequence number in a mini-batch. "
"The output is no longer a LoDTensor.");
AddComment(R"DOC(
LinearChainCRF Operator.
Conditional Random Field defines an undirected probabilistic graph with nodes
denoting random variables and edges denoting dependencies between these
variables. CRF learns the conditional probability \f$P(Y|X)\f$, where
@ -82,29 +84,28 @@ and output must be linear sequences. Thus, the graph of such a CRF is a simple
chain or a line, which results in the linear chain CRF.
This operator implements the Forward-Backward algorithm for the linear chain
CRF. Please see http://www.cs.columbia.edu/~mcollins/fb.pdf and
http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for reference.
CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.
Equation:
- Denote Input(Emission) to this operator as \f$x\f$ here.
- The first D values of Input(Transition) to this operator are for starting
1. Denote Input(Emission) to this operator as \f$x\f$ here.
2. The first D values of Input(Transition) to this operator are for starting
weights, denoted as \f$a\f$ here.
- The next D values of Input(Transition) of this operator are for ending
3. The next D values of Input(Transition) of this operator are for ending
weights, denoted as \f$b\f$ here.
- The remaning values of Input(Transition) are for transition weights,
4. The remaining values of Input(Transition) are for transition weights,
denoted as \f$w\f$ here.
- Denote Input(Label) as \f$s\f$ here.
5. Denote Input(Label) as \f$s\f$ here.
The probability of a sequence \f$s\f$ of length \f$L\f$ is defined as:
\f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L}
\f$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L}
+ \sum_{l=1}^L x_{s_l}
+ \sum_{l=2}^L w_{s_{l-1},s_l})\f$
where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over
all possible sequences is \f$1\f$, and \f$x\f$ is the emission feature weight
to the linear chain CRF.
Finaly, the linear chain CRF operator outputs the logarithm of the conditional
Finally, the linear chain CRF operator outputs the logarithm of the conditional
likelihood of each training sample in a mini-batch.
NOTE:

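As an illustration of the Transition layout and the sequence score defined in the CRF comment above (first D values are the start weights a, the next D the end weights b, the rest the transition weights w), here is a standalone sketch that computes the unnormalized log-score of one tag sequence; the full log-likelihood would additionally subtract log Z. Names are assumed for illustration, not taken from the operator's kernel.

#include <cstddef>
#include <vector>

// emission is [L x D] row-major; transition is [(D + 2) x D] row-major with
// row 0 = start weights a, row 1 = end weights b, rows 2..D+1 = transitions w.
float SequenceScore(const std::vector<float>& emission,
                    const std::vector<float>& transition,
                    const std::vector<size_t>& tags, size_t D) {
  size_t L = tags.size();
  float score = transition[0 * D + tags[0]]       // a_{s_1}
              + transition[1 * D + tags[L - 1]];  // b_{s_L}
  for (size_t l = 0; l < L; ++l) {
    score += emission[l * D + tags[l]];           // x_{l, s_l}
    if (l > 0) {
      score += transition[(2 + tags[l - 1]) * D + tags[l]];  // w_{s_{l-1}, s_l}
    }
  }
  return score;
}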
@ -55,8 +55,6 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
"(2-D tensor with shape [batch_size x 1]) "
"The label indicating X1 ranked higher than X2 or not, "
"can only be +1 or -1.");
AddAttr<T>("margin", "(scalar, default 0) Margin for MarginRankLossOp.")
.SetDefault(static_cast<T>(0));
AddOutput("Activated",
"(2-D tensor with shape [batch_size x 1]) Intermediate tensor "
"to indicate whether each element of Output(Out) is activated.")
@ -64,23 +62,26 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out",
"(2-D tensor with shape [batch_size x 1]) "
"The output loss of MarginRankLoss operator.");
AddAttr<T>("margin", "(scalar, default 0) Margin for MarginRankLossOp.")
.SetDefault(static_cast<T>(0));
AddComment(R"DOC(
MarginRankLoss Operator.
MarginRankLoss operator measures the loss given a pair of training sample
This operator measures the loss given a pair of training samples
{`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1`
indicating X1 is ranked higher than `X2`, otherwise `Label = -1`. The loss
turns out
indicating X1 is ranked higher than `X2` and `Label = -1` otherwise. The loss
is calculated as:
loss(X1, X2, Label) = max(0, -Label * (X1 - X2) + margin).
$loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$
The attribute `margin` involved here helps make the predictions more robust.
The attribute `margin` here helps make the predictions more robust.
Denote the item ranked higher as the positive sample, otherwise the negative
sample. If the score of the two samples satisfies
positive sample - negative sample < margin,
$positive sample - negative sample < margin$
the pair of samples will contribute to the final loss, which will backpropogate
and train the ranking model to enlarge the difference of the two score.
the pair of samples will contribute to the final loss, which will backpropagate
and train the ranking model to enlarge the difference between the two scores.
For batch input with size `batch_size`, `X1`, `X2` and `Label`
all have the same shape [batch_size x 1].
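A standalone sketch of the loss formula above applied to a batch (illustrative only; label entries are +1 or -1):

#include <algorithm>
#include <vector>

// loss(X1, X2, Label) = max(0, -Label * (X1 - X2) + margin), elementwise.
std::vector<float> MarginRankLoss(const std::vector<float>& x1,
                                  const std::vector<float>& x2,
                                  const std::vector<float>& label,
                                  float margin) {
  std::vector<float> loss(x1.size());
  for (size_t i = 0; i < x1.size(); ++i) {
    loss[i] = std::max(0.0f, -label[i] * (x1[i] - x2[i]) + margin);
  }
  return loss;
}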
