From 65969dad641a95a1ac0f744b11c1166a173d169b Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Fri, 16 Jun 2017 16:29:08 +0800
Subject: [PATCH 01/79] Add DetectionOutputLayer and MultiBoxLossLayer.

---
 .../gserver/layers/DetectionOutputLayer.cpp   | 154 ++++++++
 paddle/gserver/layers/DetectionOutputLayer.h  |  81 ++++
 paddle/gserver/layers/MultiBoxLossLayer.cpp   | 365 ++++++++++++++++++
 paddle/gserver/layers/MultiBoxLossLayer.h     | 103 +++++
 paddle/gserver/tests/CMakeLists.txt           |   7 +
 paddle/gserver/tests/LayerGradUtil.cpp        |  25 ++
 paddle/gserver/tests/LayerGradUtil.h          |  18 +-
 paddle/gserver/tests/test_DetectionOutput.cpp | 191 +++++++++
 paddle/gserver/tests/test_LayerGrad.cpp       |  64 +++
 proto/ModelConfig.proto                       |  25 ++
 python/paddle/trainer/config_parser.py        |  46 +++
 .../paddle/trainer_config_helpers/layers.py   | 161 ++++++++
 12 files changed, 1239 insertions(+), 1 deletion(-)
 create mode 100644 paddle/gserver/layers/DetectionOutputLayer.cpp
 create mode 100644 paddle/gserver/layers/DetectionOutputLayer.h
 create mode 100644 paddle/gserver/layers/MultiBoxLossLayer.cpp
 create mode 100644 paddle/gserver/layers/MultiBoxLossLayer.h
 create mode 100644 paddle/gserver/tests/test_DetectionOutput.cpp

diff --git a/paddle/gserver/layers/DetectionOutputLayer.cpp b/paddle/gserver/layers/DetectionOutputLayer.cpp
new file mode 100644
index 0000000000..2a4d7f8b5b
--- /dev/null
+++ b/paddle/gserver/layers/DetectionOutputLayer.cpp
@@ -0,0 +1,154 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DetectionOutputLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(detection_output, DetectionOutputLayer);
+
+bool DetectionOutputLayer::init(const LayerMap& layerMap,
+                                const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  auto& layerConf = config_.inputs(0).detection_output_conf();
+  numClasses_ = layerConf.num_classes();
+  inputNum_ = layerConf.input_num();
+  nmsThreshold_ = layerConf.nms_threshold();
+  confidenceThreshold_ = layerConf.confidence_threshold();
+  nmsTopK_ = layerConf.nms_top_k();
+  keepTopK_ = layerConf.keep_top_k();
+  backgroundId_ = layerConf.background_id();
+  return true;
+}
+
+void DetectionOutputLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
+
+  locSizeSum_ = 0;
+  confSizeSum_ = 0;
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+    locSizeSum_ += inLoc->getElementCnt();
+    confSizeSum_ += inConf->getElementCnt();
+  }
+
+  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      confTmpBuffer_, confSizeSum_ / numClasses_, numClasses_, false, useGpu_);
+  locBuffer_ = locTmpBuffer_;
+  confBuffer_ = confTmpBuffer_;
+
+  size_t locOffset = 0;
+  size_t confOffset = 0;
+  auto& layerConf = config_.inputs(0).detection_output_conf();
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+
+    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
+    if (!height) height = layerConf.height();
+    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
+    if (!width) width = layerConf.width();
+    locOffset += appendWithPermute(*inLoc,
+                                   height,
+                                   width,
+                                   locSizeSum_,
+                                   locOffset,
+                                   batchSize,
+                                   *locBuffer_,
+                                   kNCHWToNHWC);
+    confOffset += appendWithPermute(*inConf,
+                                    height,
+                                    width,
+                                    confSizeSum_,
+                                    confOffset,
+                                    batchSize,
+                                    *confBuffer_,
+                                    kNCHWToNHWC);
+  }
+  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
+  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
+
+  MatrixPtr priorValue;
+  if (useGpu_) {
+    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
+    Matrix::resizeOrCreate(
+        confCpuBuffer_, confSizeSum_ / numClasses_, numClasses_, false, false);
+    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
+    Matrix::resizeOrCreate(
+        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
+
+    locCpuBuffer_->copyFrom(*locTmpBuffer_);
+    confCpuBuffer_->copyFrom(*confTmpBuffer_);
+    priorCpuValue_->copyFrom(*priorTmpValue);
+
+    locBuffer_ = locCpuBuffer_;
+    confBuffer_ = confCpuBuffer_;
+    priorValue = priorCpuValue_;
+  } else {
+    priorValue = getInputValue(*getPriorBoxLayer());
+  }
+  confBuffer_->softmax(*confBuffer_);
+
+  size_t numPriors = priorValue->getElementCnt() / 8;
+  vector<vector<NormalizedBBox>> allDecodedBBoxes;
+  for (size_t n = 0; n < batchSize; ++n) {
+    vector<NormalizedBBox> decodedBBoxes;
+    for (size_t i = 0; i < numPriors; ++i) {
+      size_t priorOffset = i * 8;
+      size_t locPredOffset = n * numPriors * 4 + i * 4;
+      vector<NormalizedBBox> priorBBoxVec;
+      getBBoxFromPriorData(
+          priorValue->getData() + priorOffset, 1, priorBBoxVec);
+      vector<vector<real>> priorBBoxVar;
+      getBBoxVarFromPriorData(
+          priorValue->getData() + priorOffset, 1, priorBBoxVar);
+      vector<real> locPredData;
+      for (size_t j = 0; j < 4; ++j)
+        locPredData.push_back(*(locBuffer_->getData() + locPredOffset + j));
+      NormalizedBBox bbox =
+          decodeBBoxWithVar(priorBBoxVec[0], priorBBoxVar[0], locPredData);
+      decodedBBoxes.push_back(bbox);
+    }
+    allDecodedBBoxes.push_back(decodedBBoxes);
+  }
+
+  vector<map<size_t, vector<size_t>>> allIndices;
+  size_t numKept = getDetectionIndices(confBuffer_->getData(),
+                                       numPriors,
+                                       numClasses_,
+                                       backgroundId_,
+                                       batchSize,
+                                       confidenceThreshold_,
+                                       nmsTopK_,
+                                       nmsThreshold_,
+                                       keepTopK_,
+                                       allDecodedBBoxes,
+                                       &allIndices);
+
+  resetOutput(numKept, 7);
+  MatrixPtr outV = getOutputValue();
+  getDetectionOutput(confBuffer_->getData(),
+                     numKept,
+                     numPriors,
+                     numClasses_,
+                     batchSize,
+                     allIndices,
+                     allDecodedBBoxes,
+                     *outV);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DetectionOutputLayer.h b/paddle/gserver/layers/DetectionOutputLayer.h
new file mode 100644
index 0000000000..38271cb054
--- /dev/null
+++ b/paddle/gserver/layers/DetectionOutputLayer.h
@@ -0,0 +1,81 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <vector>
+#include "DetectionUtil.h"
+#include "Layer.h"
+
+using std::vector;
+using std::map;
+using std::pair;
+
+namespace paddle {
+
+/**
+ * The detection output layer for an SSD detection task. This layer applies
+ * non-maximum suppression (NMS) to all of the predicted bounding boxes and
+ * keeps the top-K bounding boxes.
+ * - Input: This layer needs three input layers: the first input layer
+ *   is the priorbox layer. The remaining two input layers are convolution
+ *   layers that generate the bbox location offsets and the classification
+ *   confidences.
+ * - Output: The predicted bounding box locations.
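+ *   As a sketch of the layout (exercised by test_DetectionOutput.cpp),
+ *   each kept detection is one output row of seven values:
+ *   | image_id | label | confidence | xmin | ymin | xmax | ymax |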
+ */
+
+class DetectionOutputLayer : public Layer {
+public:
+  explicit DetectionOutputLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+
+  void backward(const UpdateCallback& callback = nullptr) {}
+
+protected:
+  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
+
+  inline LayerPtr getLocInputLayer(size_t index) {
+    return inputLayers_[1 + index];
+  }
+
+  inline LayerPtr getConfInputLayer(size_t index) {
+    return inputLayers_[1 + inputNum_ + index];
+  }
+
+private:
+  size_t numClasses_;  // number of classes
+  size_t inputNum_;    // number of input layers
+  real nmsThreshold_;
+  real confidenceThreshold_;
+  size_t nmsTopK_;
+  size_t keepTopK_;
+  size_t backgroundId_;
+
+  size_t locSizeSum_;
+  size_t confSizeSum_;
+
+  MatrixPtr locBuffer_;
+  MatrixPtr confBuffer_;
+  MatrixPtr locTmpBuffer_;
+  MatrixPtr confTmpBuffer_;
+  MatrixPtr priorCpuValue_;
+  MatrixPtr locCpuBuffer_;
+  MatrixPtr confCpuBuffer_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MultiBoxLossLayer.cpp b/paddle/gserver/layers/MultiBoxLossLayer.cpp
new file mode 100644
index 0000000000..27a2cc3fa4
--- /dev/null
+++ b/paddle/gserver/layers/MultiBoxLossLayer.cpp
@@ -0,0 +1,365 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MultiBoxLossLayer.h"
+#include <float.h>
+#include <vector>
+#include "DataLayer.h"
+
+using std::vector;
+using std::map;
+using std::pair;
+
+namespace paddle {
+
+REGISTER_LAYER(multibox_loss, MultiBoxLossLayer);
+
+bool MultiBoxLossLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  auto layerConf = config_.inputs(0).multibox_loss_conf();
+  numClasses_ = layerConf.num_classes();
+  inputNum_ = layerConf.input_num();
+  overlapThreshold_ = layerConf.overlap_threshold();
+  negPosRatio_ = layerConf.neg_pos_ratio();
+  negOverlap_ = layerConf.neg_overlap();
+  backgroundId_ = layerConf.background_id();
+  return true;
+}
+
+void MultiBoxLossLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
+  resetOutput(batchSize, 1);
+
+  // total sizes of the location data and confidence score data
+  locSizeSum_ = 0;
+  confSizeSum_ = 0;
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+    locSizeSum_ += inLoc->getElementCnt();
+    confSizeSum_ += inConf->getElementCnt();
+  }
+
+  // locBuffer layout:
+  // | xmin1 | ymin1 | xmax1 | ymax1 | xmin2 ......
+  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
+  locBuffer_ = locTmpBuffer_;
+
+  // confBuffer layout:
+  // | class1 score | class2 score | ... | classN score | class1 score | ......
+  Matrix::resizeOrCreate(confTmpBuffer_, 1, confSizeSum_, false, useGpu_);
+  confBuffer_ = confTmpBuffer_;
+
+  // concatenate location data and confidence score data
+  size_t locOffset = 0;
+  size_t confOffset = 0;
+  auto& layerConf = config_.inputs(0).multibox_loss_conf();
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
+    if (!height) height = layerConf.height();
+    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
+    if (!width) width = layerConf.width();
+    locOffset += appendWithPermute(*inLoc,
+                                   height,
+                                   width,
+                                   locSizeSum_,
+                                   locOffset,
+                                   batchSize,
+                                   *locBuffer_,
+                                   kNCHWToNHWC);
+    confOffset += appendWithPermute(*inConf,
+                                    height,
+                                    width,
+                                    confSizeSum_,
+                                    confOffset,
+                                    batchSize,
+                                    *confBuffer_,
+                                    kNCHWToNHWC);
+  }
+  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
+  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
+
+  // priorValue layout:
+  // | xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var |
+  // ymax1Var | xmin2 | ......
+  MatrixPtr priorValue;
+
+  // labelValue layout:
+  // | class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ......
+  MatrixPtr labelValue;
+
+  // Copy data from GPU to CPU when using GPU
+  if (useGpu_) {
+    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
+    Matrix::resizeOrCreate(confCpuBuffer_, 1, confSizeSum_, false, false);
+    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
+    Matrix::resizeOrCreate(
+        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
+    MatrixPtr labelTmpValue = getInputValue(*getLabelLayer());
+    Matrix::resizeOrCreate(labelCpuValue_,
+                           labelTmpValue->getHeight(),
+                           labelTmpValue->getWidth(),
+                           false,
+                           false);
+
+    locCpuBuffer_->copyFrom(*locTmpBuffer_);
+    confCpuBuffer_->copyFrom(*confTmpBuffer_);
+    priorCpuValue_->copyFrom(*priorTmpValue);
+    labelCpuValue_->copyFrom(*labelTmpValue);
+
+    locBuffer_ = locCpuBuffer_;
+    confBuffer_ = confCpuBuffer_;
+    priorValue = priorCpuValue_;
+    labelValue = labelCpuValue_;
+  } else {
+    priorValue = getInputValue(*getPriorBoxLayer());
+    labelValue = getInputValue(*getLabelLayer());
+  }
+
+  // Get max scores for each prior bbox. Used in negative mining
+  vector<vector<real>> allMaxConfScore;
+  numPriors_ = priorValue->getElementCnt() / 8;
+  getMaxConfidenceScores(confBuffer_->getData(),
+                         batchSize,
+                         numPriors_,
+                         numClasses_,
+                         backgroundId_,
+                         &allMaxConfScore);
+
+  // Match prior bbox to groundtruth bbox
+  Argument label = getInput(*getLabelLayer());
+  const int* labelIndex = label.sequenceStartPositions->getData(false);
+  size_t seqNum = label.getNumSequences();
+  numMatches_ = 0;
+  numNegs_ = 0;
+  allMatchIndices_.clear();
+  allNegIndices_.clear();
+
+  pair<size_t, size_t> retPair = generateMatchIndices(*priorValue,
+                                                      numPriors_,
+                                                      *labelValue,
+                                                      labelIndex,
+                                                      seqNum,
+                                                      allMaxConfScore,
+                                                      batchSize,
+                                                      overlapThreshold_,
+                                                      negOverlap_,
+                                                      negPosRatio_,
+                                                      &allMatchIndices_,
+                                                      &allNegIndices_);
+  numMatches_ = retPair.first;
+  numNegs_ = retPair.second;
+
+  // BBox location L1 smooth loss
+  locLoss_ = 0.0;
+  if (numMatches_ >= 1) {
+    size_t count = 0;
+    MatrixPtr locLossOutput;
+    Matrix::resizeOrCreate(locLossOutput, numMatches_ * 4, 1, false, false);
+    Matrix::resizeOrCreate(locGTData_, numMatches_ * 4, 1, false, false);
+    Matrix::resizeOrCreate(locDiff_, numMatches_ * 4, 1, false, false);
+    locDiff_->zeroMem();
+    vector<real> locGTData;
+
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;  // match none
+        size_t locOffset =
+            n * (locBuffer_->getElementCnt() / batchSize) + i * 4;
+        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[0];
+        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[1];
+        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[2];
+        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[3];
+
+        const int gtIdx = allMatchIndices_[n][i];
+        size_t priorOffset = i * 8;
+        vector<NormalizedBBox> priorBBoxVec;
+        getBBoxFromPriorData(
+            priorValue->getData() + priorOffset, 1, priorBBoxVec);
+        vector<vector<real>> priorBBoxVar;
+        getBBoxVarFromPriorData(
+            priorValue->getData() + priorOffset, 1, priorBBoxVar);
+        size_t labelOffset = (labelIndex[n] + gtIdx) * 6;
+        vector<NormalizedBBox> gtBBoxVec;
+        getBBoxFromLabelData(labelValue->getData() + labelOffset, 1, gtBBoxVec);
+        vector<real> gtEncode;
+        encodeBBoxWithVar(
+            priorBBoxVec[0], priorBBoxVar[0], gtBBoxVec[0], gtEncode);
+        locGTData.insert(locGTData.end(), gtEncode.begin(), gtEncode.end());
+      }
+    }
+    locGTData_->copyFrom(&locGTData[0], numMatches_ * 4);
+    locLossOutput->smoothL1(*locDiff_, *locGTData_, 0.0);
+    locLoss_ = locLossOutput->getSum() / numMatches_;
+  }
+
+  // BBox confidence softmax loss
+  confLoss_ = 0;
+  numConf_ = numMatches_ + numNegs_;
+  if (numConf_ >= 1) {
+    Matrix::resizeOrCreate(confProb_, numConf_, numClasses_, false, false);
+    IVector::resizeOrCreate(confGTData_, numConf_, false);
+    confProb_->zeroMem();
+    size_t count = 0;
+
+    vector<real> confPredData;
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;
+        size_t labelOffset = (labelIndex[n] + allMatchIndices_[n][i]) * 6;
+        const int gtLabel = (labelValue->getData() + labelOffset)[0];
+        confGTData_->getData()[count] = gtLabel;
+        size_t confOffset = n * numPriors_ * numClasses_ + i * numClasses_;
+        for (size_t j = 0; j < numClasses_; ++j) {
+          confProb_->getData()[count * numClasses_ + j] =
+              (confBuffer_->getData() + confOffset)[j];
+          confPredData.push_back((confBuffer_->getData() + confOffset)[j]);
+        }
+        ++count;
+      }
+      // Negative mining samples
+      for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
+        confGTData_->getData()[count] = backgroundId_;
+        size_t confOffset =
+            n * numPriors_ * numClasses_ + allNegIndices_[n][i] * numClasses_;
+        for (size_t j = 0; j < numClasses_; ++j) {
+          confProb_->getData()[count * numClasses_ + j] =
+              (confBuffer_->getData() + confOffset)[j];
+          confPredData.push_back((confBuffer_->getData() + confOffset)[j]);
+        }
+        count++;
+      }
+    }
+    confProb_->softmax(*confProb_);
+    MatrixPtr confLossOutput;
+    Matrix::resizeOrCreate(confLossOutput, numConf_, 1, false, false);
+    confLossOutput->oneHotCrossEntropy(*confProb_, *confGTData_);
+    confLoss_ = confLossOutput->getSum() / numMatches_;
+  }
+  real loss = locLoss_ + confLoss_;
+  MatrixPtr outV = getOutputValue();
+  vector<real> tmp(batchSize, loss);
+  outV->copyFrom(&tmp[0], batchSize);
+}
+
+void MultiBoxLossLayer::backward(const UpdateCallback& callback) {
+  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
+  locBuffer_->zeroMem();
+  confBuffer_->zeroMem();
+
+  // Back propagate on location prediction
+  if (numMatches_ >= 1) {
+    MatrixPtr locDiffBuffer;
+    Matrix::resizeOrCreate(locDiffBuffer, numMatches_ * 4, 1, false, false);
+    locDiffBuffer->smoothL1Bp(*locDiff_, *locGTData_, 0.0);
+    locDiff_->copyFrom(*locDiffBuffer);
+    // scale gradient
+    for (size_t i = 0; i < numMatches_ * 4; ++i)
+      locDiff_->getData()[i] *= (1. / numMatches_);
+    // Copy gradient back
+    size_t count = 0;
+    for (size_t n = 0; n < batchSize; ++n)
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;
+        real* locDiffData = locBuffer_->getData() + n * numPriors_ * 4 + i * 4;
+        locDiffData[0] = (locDiff_->getData() + count * 4)[0];
+        locDiffData[1] = (locDiff_->getData() + count * 4)[1];
+        locDiffData[2] = (locDiff_->getData() + count * 4)[2];
+        locDiffData[3] = (locDiff_->getData() + count * 4)[3];
+        ++count;
+      }
+    CHECK_EQ(count, numMatches_);
+  }
+
+  if (numConf_ >= 1) {
+    for (size_t i = 0; i < numConf_; ++i)
+      confProb_->getData()[i * numClasses_ + confGTData_->getData()[i]] -= 1;
+    for (size_t i = 0; i < numConf_ * numClasses_; ++i)
+      confProb_->getData()[i] *= (1. / numMatches_);
+    size_t count = 0;
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;
+        real* confDiffData = confBuffer_->getData() +
+                             n * numPriors_ * numClasses_ + i * numClasses_;
+        for (size_t j = 0; j < numClasses_; ++j)
+          confDiffData[j] = (confProb_->getData() + count * numClasses_)[j];
+        ++count;
+      }
+      for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
+        int idx = allNegIndices_[n][i];
+        real* confDiffData = confBuffer_->getData() +
+                             n * numPriors_ * numClasses_ + idx * numClasses_;
+        for (size_t j = 0; j < numClasses_; ++j)
+          confDiffData[j] = (confProb_->getData() + count * numClasses_)[j];
+        ++count;
+      }
+    }
+    CHECK_EQ(count, numConf_);
+  }
+  if (useGpu_) {
+    locTmpBuffer_->copyFrom(*locCpuBuffer_);
+    confTmpBuffer_->copyFrom(*confCpuBuffer_);
+    locBuffer_ = locTmpBuffer_;
+    confBuffer_ = confTmpBuffer_;
+  }
+  // copy back
+  size_t locOffset = 0;
+  size_t confOffset = 0;
+  auto layerConf = config_.inputs(0).multibox_loss_conf();
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLocG = getInputGrad(*getLocInputLayer(n));
+    const MatrixPtr inConfG = getInputGrad(*getConfInputLayer(n));
+    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
+    if (!height) height = layerConf.height();
+    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
+    if (!width) width = layerConf.width();
+
+    // NHWC to NCHW
+    MatrixPtr locGBuffer;
+    Matrix::resizeOrCreate(
+        locGBuffer, inLocG->getHeight(), inLocG->getWidth(), false, useGpu_);
+    MatrixPtr confGBuffer;
+    Matrix::resizeOrCreate(
+        confGBuffer, inConfG->getHeight(), inConfG->getWidth(), false, useGpu_);
+
+    locOffset += decomposeWithPermute(*locBuffer_,
+                                      height,
+                                      width,
+                                      locSizeSum_,
+                                      locOffset,
+                                      batchSize,
+                                      *locGBuffer,
+                                      kNHWCToNCHW);
+    inLocG->add(*locGBuffer);
+    confOffset += decomposeWithPermute(*confBuffer_,
+                                       height,
+                                       width,
+                                       confSizeSum_,
+                                       confOffset,
+                                       batchSize,
+                                       *confGBuffer,
+                                       kNHWCToNCHW);
+    inConfG->add(*confGBuffer);
+  }
+  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
+  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MultiBoxLossLayer.h b/paddle/gserver/layers/MultiBoxLossLayer.h
new file mode 100644
index 0000000000..9767fed7f1
--- /dev/null
+++ b/paddle/gserver/layers/MultiBoxLossLayer.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "CostLayer.h"
+#include "DataLayer.h"
+#include "DetectionUtil.h"
+#include "Layer.h"
+
+using std::vector;
+using std::pair;
+
+namespace paddle {
+
+/**
+ * The multibox loss layer for an SSD detection task.
+ * The loss is composed of the location loss and the confidence loss.
+ * The location loss is a smooth L1 loss and the confidence loss is
+ * a softmax loss.
+ * - Input: This layer needs four input layers: the first input layer
+ *   is the priorbox layer and the second layer is the label layer.
+ *   The remaining two input layers are convolution layers that generate
+ *   the bbox location offsets and the classification confidences.
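+ *   As a rough sketch, the cost computed in forward() is
+ *     cost = locLoss + confLoss
+ *          = SmoothL1(locPred, encodedGTBBox) / numMatches
+ *          + CrossEntropy(softmax(confPred), gtLabel) / numMatches,
+ *   i.e. both terms are normalized by the number of matched priors.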
+ * - Output: The Single Shot Multibox Detection loss value.
+ * Reference:
+ *   Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *   Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
+ */
+
+class MultiBoxLossLayer : public CostLayer {
+public:
+  explicit MultiBoxLossLayer(const LayerConfig& config) : CostLayer(config) {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+
+  void backward(const UpdateCallback& callback = nullptr);
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) {}
+
+  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {}
+
+protected:
+  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
+  inline LayerPtr getLabelLayer() { return inputLayers_[1]; }
+  inline LayerPtr getLocInputLayer(size_t index) {
+    return inputLayers_[2 + index];
+  }
+  inline LayerPtr getConfInputLayer(size_t index) {
+    return inputLayers_[2 + inputNum_ + index];
+  }
+
+protected:
+  size_t numClasses_;
+  real overlapThreshold_;
+  real negPosRatio_;
+  real negOverlap_;
+  size_t inputNum_;
+  size_t backgroundId_;
+
+  real locLoss_;
+  real confLoss_;
+
+  size_t numPriors_;
+  size_t numMatches_;
+  size_t numNegs_;
+  size_t numConf_;
+  size_t locSizeSum_;
+  size_t confSizeSum_;
+
+  vector<vector<int>> allMatchIndices_;
+  vector<vector<int>> allNegIndices_;
+  MatrixPtr locGTData_;
+  IVectorPtr confGTData_;
+
+  MatrixPtr locBuffer_;
+  MatrixPtr confBuffer_;
+  MatrixPtr locDiff_;
+  MatrixPtr confProb_;
+
+  MatrixPtr labelCpuValue_;
+  MatrixPtr priorCpuValue_;
+  MatrixPtr locCpuBuffer_;
+  MatrixPtr confCpuBuffer_;
+  MatrixPtr locTmpBuffer_;
+  MatrixPtr confTmpBuffer_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 3c4128b5b8..92f6cbcfe5 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -45,6 +45,13 @@ add_unittest_without_exec(test_PriorBox
 add_test(NAME test_PriorBox
     COMMAND test_PriorBox)
 
+################# test_DetectionOutput #######################
+add_unittest_without_exec(test_DetectionOutput
+    test_DetectionOutput.cpp
+    LayerGradUtil.cpp)
+
+add_test(NAME test_DetectionOutput
+    COMMAND test_DetectionOutput)
 ################# test_ConvUnify #######################
 add_unittest_without_exec(test_ConvUnify
     test_ConvUnify.cpp
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index a0b1cd471d..e3591ba4df 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -387,6 +387,31 @@ void initDataLayer(TestConfig testConf,
       data.value->sigmoid(*data.value);
       data.grad->zeroMem();
       break;
+    case INPUT_SELF_DEFINE_DATA: {
+      size_t height = testConf.inputDefs[i].selfDefinedData->getHeight();
+      size_t width = testConf.inputDefs[i].selfDefinedData->getWidth();
+      CHECK_GT(static_cast<int>(height), 0);
+      CHECK_GT(static_cast<int>(width), 0);
+      data.value = Matrix::create(height, width, false, useGpu);
+      data.grad = Matrix::create(height, width, false, useGpu);
+      data.value->copyFrom(*testConf.inputDefs[i].selfDefinedData);
+      data.grad->zeroMem();
+
+      const std::vector<int>& labelSeqStartPositions =
+          testConf.inputDefs[i].labelSeqStartPositions;
+      if (labelSeqStartPositions.size() != 0) {
+        CHECK(!sequenceStartPositions);
+        CHECK_GE(static_cast<int>(labelSeqStartPositions.size()), 2);
+
+        sequenceStartPositions =
+            ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu);
+        sequenceStartPositions->copyFrom(labelSeqStartPositions.data(),
+                                         labelSeqStartPositions.size(),
+                                         useGpu);
+        data.sequenceStartPositions = sequenceStartPositions;
+      }
+      break;
+    }
     default:
       LOG(FATAL) << " unknown inputType ";
       return;
diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h
index 9f68eb64d0..18a6525a14 100644
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
@@ -31,7 +31,8 @@ enum InputType {
   INPUT_SEQUENCE_LABEL,
   INPUT_SPARSE_NON_VALUE_DATA,
   INPUT_SPARSE_FLOAT_VALUE_DATA,
-  INPUT_DENSE_DIM_DATA,  // using sequence length to init dense data
+  INPUT_DENSE_DIM_DATA,    // using sequence length to init dense data
+  INPUT_SELF_DEFINE_DATA,  // support customizing for input value
 };
 
 struct ParaSparse {
@@ -66,6 +67,7 @@ struct InputDef {
   bool isStatic;
   std::vector<int> labelInitValue;
   std::vector<int> labelSeqStartPositions;
+  MatrixPtr selfDefinedData;
 
   InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) {
     inputType = type;
@@ -76,6 +78,20 @@ struct InputDef {
     isStatic = false;
   }
 
+  InputDef(InputType type,
+           string nameIn,
+           MatrixPtr selfDefinedData,
+           std::vector<int> selfDefinedSeqStartPos = {})
+      : labelSeqStartPositions(selfDefinedSeqStartPos),
+        selfDefinedData(selfDefinedData) {
+    inputType = type;
+    name = nameIn;
+    dim = 0;
+    sparse = {""};
+    paraSize = 0;
+    isStatic = false;
+  }
+
   InputDef(InputType type,
            string nameIn,
            size_t dimIn,
diff --git a/paddle/gserver/tests/test_DetectionOutput.cpp b/paddle/gserver/tests/test_DetectionOutput.cpp
new file mode 100644
index 0000000000..8ec7a28450
--- /dev/null
+++ b/paddle/gserver/tests/test_DetectionOutput.cpp
@@ -0,0 +1,191 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+// Do one forward pass of the detection output layer and check to see if its
+// output matches the given result
+void doOneDetectionOutputTest(MatrixPtr& inputLoc,
+                              MatrixPtr& inputConf,
+                              MatrixPtr& inputPriorBox,
+                              size_t feature_map_width,
+                              size_t feature_map_height,
+                              real nms_threshold,
+                              bool use_gpu,
+                              MatrixPtr& result) {
+  // Setting up the detection output layer
+  TestConfig configt;
+  configt.layerConfig.set_type("detection_output");
+  LayerInputConfig* input = configt.layerConfig.add_inputs();
+  configt.layerConfig.add_inputs();
+  configt.layerConfig.add_inputs();
+
+  DetectionOutputConfig* detOutput = input->mutable_detection_output_conf();
+  detOutput->set_width(feature_map_width);
+  detOutput->set_height(feature_map_height);
+  detOutput->set_nms_threshold(nms_threshold);
+  detOutput->set_num_classes(2);
+  detOutput->set_nms_top_k(20);
+  detOutput->set_keep_top_k(10);
+  detOutput->set_background_id(0);
+  detOutput->set_confidence_threshold(0.01);
+  detOutput->set_input_num(1);
+  configt.inputDefs.push_back({INPUT_DATA_TARGET, "priorbox", 32, 0});
+  configt.inputDefs.push_back({INPUT_DATA, "input_loc", 16, 0});
+  configt.inputDefs.push_back({INPUT_DATA, "input_conf", 8, 0});
+
+  // data layer initialize
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+  initDataLayer(
+      configt, &dataLayers, &datas, &layerMap, "priorbox", 1, false, use_gpu);
+
+  dataLayers[0]->getOutputValue()->copyFrom(*inputPriorBox);
+  dataLayers[1]->getOutputValue()->copyFrom(*inputLoc);
+  dataLayers[2]->getOutputValue()->copyFrom(*inputConf);
+
+  // test layer initialize
+  std::vector<ParameterPtr> parameters;
+  LayerPtr detectionOutputLayer;
+  initTestLayer(configt, &layerMap, &parameters, &detectionOutputLayer);
+  detectionOutputLayer->forward(PASS_GC);
+  checkMatrixEqual(detectionOutputLayer->getOutputValue(), result);
+}
+
+TEST(Layer, detectionOutputLayerFwd) {
+  bool useGpu = false;
+  // CPU case 1.
+  MatrixPtr inputLoc;
+  MatrixPtr inputConf;
+  MatrixPtr inputPriorBox;
+  MatrixPtr result, result2, result3, result4;
+  real nmsThreshold = 0.01;
+  real inputLocData[] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
+                         0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+  real inputConfData[] = {0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.4, 0.6};
+  real inputPriorBoxData[] = {0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2,
+                              0.2, 0.2, 0.6, 0.6, 0.1, 0.1, 0.2, 0.2,
+                              0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2,
+                              0.4, 0.4, 0.8, 0.8, 0.1, 0.1, 0.2, 0.2};
+  real resultData[] = {
+      0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031};
+  inputLoc = Matrix::create(1, 16, false, useGpu);
+  inputConf = Matrix::create(1, 8, false, useGpu);
+  inputPriorBox = Matrix::create(1, 32, false, useGpu);
+  result = Matrix::create(1, 7, false, useGpu);
+  inputLoc->setData(inputLocData);
+  inputConf->setData(inputConfData);
+  inputPriorBox->setData(inputPriorBoxData);
+  result->setData(resultData);
+  doOneDetectionOutputTest(inputLoc,
+                           inputConf,
+                           inputPriorBox,
+                           /* feature_map_width */ 1,
+                           /* feature_map_height */ 1,
+                           nmsThreshold,
+                           useGpu,
+                           result);
+
+  // CPU case 2.
+  nmsThreshold = 0.2;
+  result2 = Matrix::create(2, 7, false, useGpu);
+  real resultData2[] = {0, 1, 0.68997443, 0.099959746, 0.099959746,
+                        0.50804031, 0.50804031, 0, 1, 0.59868765,
+                        0.29995975, 0.29995975, 0.70804024, 0.70804024};
+  result2->setData(resultData2);
+  doOneDetectionOutputTest(inputLoc,
+                           inputConf,
+                           inputPriorBox,
+                           /* feature_map_width */ 1,
+                           /* feature_map_height */ 1,
+                           nmsThreshold,
+                           useGpu,
+                           result2);
+
+#ifndef PADDLE_ONLY_CPU
+  // GPU case 1.
+  useGpu = true;
+  inputLoc = Matrix::create(1, 16, false, useGpu);
+  inputConf = Matrix::create(1, 8, false, useGpu);
+  inputPriorBox = Matrix::create(1, 32, false, useGpu);
+  inputLoc->copyFrom(inputLocData, 16);
+  inputConf->copyFrom(inputConfData, 8);
+  inputPriorBox->copyFrom(inputPriorBoxData, 32);
+
+  nmsThreshold = 0.01;
+  result3 = Matrix::create(1, 7, false, useGpu);
+  result3->copyFrom(resultData, 7);
+  doOneDetectionOutputTest(inputLoc,
+                           inputConf,
+                           inputPriorBox,
+                           /* feature_map_width */ 1,
+                           /* feature_map_height */ 1,
+                           nmsThreshold,
+                           useGpu,
+                           result3);
+
+  // GPU case 2.
+  nmsThreshold = 0.2;
+  result4 = Matrix::create(2, 7, false, useGpu);
+  result4->copyFrom(resultData2, 14);
+  doOneDetectionOutputTest(inputLoc,
+                           inputConf,
+                           inputPriorBox,
+                           /* feature_map_width */ 1,
+                           /* feature_map_height */ 1,
+                           nmsThreshold,
+                           useGpu,
+                           result4);
+#endif
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 6adffcf53b..9c79bd19ee 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1689,6 +1689,70 @@ TEST(Layer, smooth_l1) {
   }
 }
 
+TEST(Layer, multibox_loss) {
+  TestConfig config;
+  config.layerConfig.set_type("multibox_loss");
+  config.biasSize = 0;
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  MultiBoxLossConfig* multiboxLoss = input->mutable_multibox_loss_conf();
+  multiboxLoss->set_num_classes(21);
+  multiboxLoss->set_input_num(1);
+  multiboxLoss->set_overlap_threshold(0.5);
+  multiboxLoss->set_neg_pos_ratio(3);
+  multiboxLoss->set_neg_overlap(0.5);
+  multiboxLoss->set_background_id(0);
+  multiboxLoss->set_height(3);
+  multiboxLoss->set_width(3);
+
+  size_t gtNum = 1;
+  MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false);
+  labelValue->randomizeUniform();
+  labelValue->add(-0.5);
+  labelValue->sigmoid(*labelValue);
+  real* labelData = labelValue->getData();
+  size_t labelWidth = labelValue->getWidth();
+  for (size_t i = 0; i < gtNum; ++i) {
+    *(labelData + i * labelWidth) = std::rand() % 20 + 1;
+    *(labelData + i * labelWidth + 1) = 0.400259;
+    *(labelData + i * labelWidth + 2) = 0.377857;
+    *(labelData + i * labelWidth + 3) = 0.525712;
+    *(labelData + i * labelWidth + 4) = 0.519368;
+  }
+  vector<int> seqStartPositions(gtNum + 1, 0);
+  for (size_t i = 1; i <= gtNum; ++i) {
+    seqStartPositions[i] = i;
+  }
+
+  // Ensure at least one matched bbox
+  MatrixPtr priorValue = Matrix::create(1, 72, false, false);
+  priorValue->randomizeUniform();
+  priorValue->add(-0.5);
+  priorValue->sigmoid(*priorValue);
+  real* priorData = priorValue->getData();
+  *(priorData) = 0.424811;
+  *(priorData + 1) = 0.397059;
+  *(priorData + 2) = 0.538905;
+  *(priorData + 3) = 0.447091;
+  *(priorData + 4) = 0.425720;
+  *(priorData + 5) = 0.515228;
+  *(priorData + 6) = 0.519452;
+  *(priorData + 7) = 0.591065;
+
+  config.inputDefs.push_back(
"priorbox", priorValue, {}}); + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions}); + config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0}); + config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "multibox_loss", 1, false, useGpu, false); + } +} + TEST(Layer, TransLayer) { TestConfig config; const int height = 128; diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 29270829bb..3d01c23bf9 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -266,6 +266,29 @@ message PadConfig { repeated uint32 pad_w = 4; } +message MultiBoxLossConfig { + required uint32 num_classes = 1; + required float overlap_threshold = 2; + required float neg_pos_ratio = 3; + required float neg_overlap = 4; + required uint32 background_id = 5; + required uint32 input_num = 6; + optional uint32 height = 7 [default = 1]; + optional uint32 width = 8 [default = 1]; +} + +message DetectionOutputConfig { + required uint32 num_classes = 1; + required float nms_threshold = 2; + required uint32 nms_top_k = 3; + required uint32 background_id = 4; + required uint32 input_num = 5; + required uint32 keep_top_k = 6; + required float confidence_threshold = 7; + optional uint32 height = 8 [default = 1]; + optional uint32 width = 9 [default = 1]; +} + message LayerInputConfig { required string input_layer_name = 1; optional string input_parameter_name = 2; @@ -284,6 +307,8 @@ message LayerInputConfig { optional PriorBoxConfig priorbox_conf = 13; optional PadConfig pad_conf = 14; optional RowConvConfig row_conv_conf = 15; + optional MultiBoxLossConfig multibox_loss_conf = 16; + optional DetectionOutputConfig detection_output_conf = 17; } message LayerConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index fc2e3bbcde..c46b335d99 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1676,6 +1676,52 @@ class PriorBoxLayer(LayerBase): self.config.size = size +@config_layer('multibox_loss') +class MultiBoxLossLayer(LayerBase): + def __init__(self, name, inputs, input_num, num_classes, overlap_threshold, + neg_pos_ratio, neg_overlap, background_id): + super(MultiBoxLossLayer, self).__init__(name, 'multibox_loss', 0, + inputs) + config_assert( + len(inputs) == (input_num * 2 + 2), + 'MultiBoxLossLayer does not have enough inputs') + config_assert(num_classes > background_id, + 'Classes number must greater than background ID') + self.config.inputs[0].multibox_loss_conf.num_classes = num_classes + self.config.inputs[ + 0].multibox_loss_conf.overlap_threshold = overlap_threshold + self.config.inputs[0].multibox_loss_conf.neg_pos_ratio = neg_pos_ratio + self.config.inputs[0].multibox_loss_conf.neg_overlap = neg_overlap + self.config.inputs[0].multibox_loss_conf.background_id = background_id + self.config.inputs[0].multibox_loss_conf.input_num = input_num + self.config.size = 1 + + +@config_layer('detection_output') +class DetectionOutputLayer(LayerBase): + def __init__(self, name, inputs, size, input_num, num_classes, + nms_threshold, nms_top_k, keep_top_k, confidence_threshold, + background_id): + super(DetectionOutputLayer, self).__init__(name, 'detection_output', 0, + inputs) + config_assert( + len(inputs) == (input_num * 2 + 1), + 'DetectionOutputLayer does not have enough inputs') + 
+        config_assert(
+            num_classes > background_id,
+            'The number of classes must be greater than the background ID')
+        self.config.inputs[0].detection_output_conf.num_classes = num_classes
+        self.config.inputs[
+            0].detection_output_conf.nms_threshold = nms_threshold
+        self.config.inputs[0].detection_output_conf.nms_top_k = nms_top_k
+        self.config.inputs[0].detection_output_conf.keep_top_k = keep_top_k
+        self.config.inputs[
+            0].detection_output_conf.confidence_threshold = confidence_threshold
+        self.config.inputs[
+            0].detection_output_conf.background_id = background_id
+        self.config.inputs[0].detection_output_conf.input_num = input_num
+        self.config.size = size
+
+
 @config_layer('data')
 class DataLayer(LayerBase):
     def __init__(self, name, size, height=None, width=None, device=None):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 2d8ddbb900..770559dc77 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -115,6 +115,8 @@ __all__ = [
     'print_layer',
     'priorbox_layer',
     'cross_channel_norm_layer',
+    'multibox_loss_layer',
+    'detection_output_layer',
     'spp_layer',
     'pad_layer',
     'eos_layer',
@@ -195,6 +197,8 @@ class LayerType(object):
     PRINT_LAYER = 'print'
     PRIORBOX_LAYER = 'priorbox'
+    MULTIBOX_LOSS_LAYER = 'multibox_loss'
+    DETECTION_OUTPUT_LAYER = 'detection_output'
 
     CTC_LAYER = 'ctc'
     WARP_CTC_LAYER = 'warp_ctc'
@@ -1052,6 +1056,163 @@ def priorbox_layer(input,
         size=size)
 
 
+@wrap_name_default("multibox_loss")
+def multibox_loss_layer(input_loc,
+                        input_conf,
+                        priorbox,
+                        label,
+                        num_classes,
+                        overlap_threshold=0.5,
+                        neg_pos_ratio=3.0,
+                        neg_overlap=0.5,
+                        background_id=0,
+                        name=None):
+    """
+    Compute the location loss and the confidence loss for SSD.
+
+    :param name: The layer name.
+    :type name: basestring
+    :param input_loc: The input predicted locations.
+    :type input_loc: LayerOutput
+    :param input_conf: The input classification confidences.
+    :type input_conf: LayerOutput
+    :param priorbox: The input priorbox locations and variances.
+    :type priorbox: LayerOutput
+    :param label: The input label.
+    :type label: LayerOutput
+    :param num_classes: The number of classes.
+    :type num_classes: int
+    :param overlap_threshold: The threshold of the bbox overlap.
+    :type overlap_threshold: float
+    :param neg_pos_ratio: The ratio of negative bboxes to positive bboxes.
+    :type neg_pos_ratio: float
+    :param neg_overlap: The negative bbox overlap threshold.
+    :type neg_overlap: float
+    :param background_id: The background class index.
+    :type background_id: int
+    :return: LayerOutput
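+
+    The example usage is a sketch only; the ``loc_conv``, ``conf_conv``,
+    ``priorbox`` and ``label`` layers are assumed to be defined elsewhere
+    in the config:
+
+    .. code-block:: python
+
+       loss = multibox_loss_layer(input_loc=loc_conv,
+                                  input_conf=conf_conv,
+                                  priorbox=priorbox,
+                                  label=label,
+                                  num_classes=21)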
+    """
+    input_loc_num = 0
+    input_conf_num = 0
+
+    if isinstance(input_loc, LayerOutput):
+        input_loc = [input_loc]
+    assert isinstance(input_loc, collections.Sequence)  # list or tuple
+    for each in input_loc:
+        assert isinstance(each, LayerOutput)
+        input_loc_num += 1
+
+    if isinstance(input_conf, LayerOutput):
+        input_conf = [input_conf]
+    assert isinstance(input_conf, collections.Sequence)  # list or tuple
+    for each in input_conf:
+        assert isinstance(each, LayerOutput)
+        input_conf_num += 1
+    # Check the input layer number.
+    assert input_loc_num == input_conf_num
+
+    inputs = [priorbox.name, label.name]
+    inputs.extend([l.name for l in input_loc])
+    inputs.extend([l.name for l in input_conf])
+    parents = [priorbox, label]
+    parents.extend(input_loc)
+    parents.extend(input_conf)
+
+    Layer(
+        name=name,
+        type=LayerType.MULTIBOX_LOSS_LAYER,
+        inputs=inputs,
+        input_num=input_loc_num,
+        num_classes=num_classes,
+        overlap_threshold=overlap_threshold,
+        neg_pos_ratio=neg_pos_ratio,
+        neg_overlap=neg_overlap,
+        background_id=background_id)
+    return LayerOutput(
+        name, LayerType.MULTIBOX_LOSS_LAYER, parents=parents, size=1)
+
+
+@wrap_name_default("detection_output")
+def detection_output_layer(input_loc,
+                           input_conf,
+                           priorbox,
+                           num_classes,
+                           nms_threshold=0.45,
+                           nms_top_k=400,
+                           keep_top_k=200,
+                           confidence_threshold=0.01,
+                           background_id=0,
+                           name=None):
+    """
+    Apply non-maximum suppression (NMS) to the output of the network and
+    compute the predicted bounding box locations.
+
+    :param name: The layer name.
+    :type name: basestring
+    :param input_loc: The input predicted locations.
+    :type input_loc: LayerOutput
+    :param input_conf: The input classification confidences.
+    :type input_conf: LayerOutput
+    :param priorbox: The input priorbox locations and variances.
+    :type priorbox: LayerOutput
+    :param num_classes: The number of classes.
+    :type num_classes: int
+    :param nms_threshold: The non-maximum suppression threshold.
+    :type nms_threshold: float
+    :param nms_top_k: The number of bboxes kept in the NMS output.
+    :type nms_top_k: int
+    :param keep_top_k: The number of bboxes kept in the layer's output.
+    :type keep_top_k: int
+    :param confidence_threshold: The classification confidence threshold.
+    :type confidence_threshold: float
+    :param background_id: The background class index.
+    :type background_id: int
+    :return: LayerOutput
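+
+    The example usage is a sketch only; the ``loc_conv``, ``conf_conv`` and
+    ``priorbox`` layers are assumed to be defined elsewhere in the config:
+
+    .. code-block:: python
+
+       detections = detection_output_layer(input_loc=loc_conv,
+                                           input_conf=conf_conv,
+                                           priorbox=priorbox,
+                                           num_classes=21)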
+    """
+    input_loc_num = 0
+    input_conf_num = 0
+
+    if isinstance(input_loc, LayerOutput):
+        input_loc = [input_loc]
+    assert isinstance(input_loc, collections.Sequence)  # list or tuple
+    for each in input_loc:
+        assert isinstance(each, LayerOutput)
+        input_loc_num += 1
+
+    if isinstance(input_conf, LayerOutput):
+        input_conf = [input_conf]
+    assert isinstance(input_conf, collections.Sequence)  # list or tuple
+    for each in input_conf:
+        assert isinstance(each, LayerOutput)
+        input_conf_num += 1
+    # Check the input layer number.
+    assert input_loc_num == input_conf_num
+
+    inputs = [priorbox.name]
+    inputs.extend([l.name for l in input_loc])
+    inputs.extend([l.name for l in input_conf])
+    parents = [priorbox]
+    parents.extend(input_loc)
+    parents.extend(input_conf)
+
+    size = keep_top_k * 7
+
+    Layer(
+        name=name,
+        type=LayerType.DETECTION_OUTPUT_LAYER,
+        inputs=inputs,
+        size=size,
+        input_num=input_loc_num,
+        num_classes=num_classes,
+        nms_threshold=nms_threshold,
+        nms_top_k=nms_top_k,
+        keep_top_k=keep_top_k,
+        confidence_threshold=confidence_threshold,
+        background_id=background_id)
+    return LayerOutput(
+        name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size)
+
+
 @wrap_name_default("cross_channel_norm")
 def cross_channel_norm_layer(input, name=None, param_attr=None):
     """

From b233ed135352de1260b644112f939938798048ec Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Mon, 19 Jun 2017 14:53:59 +0800
Subject: [PATCH 02/79] Set FLAGS_use_gpu in test_DetectionOutput.

---
 paddle/gserver/tests/test_DetectionOutput.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/paddle/gserver/tests/test_DetectionOutput.cpp b/paddle/gserver/tests/test_DetectionOutput.cpp
index 8ec7a28450..af43dc51fa 100644
--- a/paddle/gserver/tests/test_DetectionOutput.cpp
+++ b/paddle/gserver/tests/test_DetectionOutput.cpp
@@ -65,9 +65,12 @@ void doOneDetectionOutputTest(MatrixPtr& inputLoc,
   dataLayers[2]->getOutputValue()->copyFrom(*inputConf);
 
   // test layer initialize
+  bool store_FLAGS_use_gpu = FLAGS_use_gpu;
+  FLAGS_use_gpu = use_gpu;
   std::vector<ParameterPtr> parameters;
   LayerPtr detectionOutputLayer;
   initTestLayer(configt, &layerMap, &parameters, &detectionOutputLayer);
+  FLAGS_use_gpu = store_FLAGS_use_gpu;
   detectionOutputLayer->forward(PASS_GC);
   checkMatrixEqual(detectionOutputLayer->getOutputValue(), result);
 }

From 3919b75884749684e0bd8b502e426fa4949f2c1f Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Wed, 28 Jun 2017 12:01:32 +0000
Subject: [PATCH 03/79] modify cmake

---
 go/master/c/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/go/master/c/CMakeLists.txt b/go/master/c/CMakeLists.txt
index acce698051..3eb598a877 100644
--- a/go/master/c/CMakeLists.txt
+++ b/go/master/c/CMakeLists.txt
@@ -6,7 +6,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
 
 project(cxx_go C Go)
 
-include(golang)
+#include(golang)
 include(flags)
 
 set(MASTER_LIB_NAME "paddle_master")

From fc3d03142582dcd673cc97fb3b0239bac59815f4 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Thu, 29 Jun 2017 09:38:25 +0800
Subject: [PATCH 04/79] first add

---
 go/master/c/client.go                         |  5 ++
 go/master/client.go                           |  3 +-
 python/paddle/v2/master/client.py             |  3 ++
 python/paddle/v2/reader/creator.py            | 49 ++++++++++++++-----
 python/paddle/v2/reader/tests/creator_test.py |  2 +-
 5 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/go/master/c/client.go b/go/master/c/client.go
index b186474dc3..b88911b858 100644
--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@@ -88,7 +88,12 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int
 func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
 	c := get(client)
 	r := c.NextRecord()
+	if r == nil {
+		// EOF
+		return -1
+	}
 	if len(r) == 0 {
+		// Empty record
 		*record = (*C.uchar)(nullPtr)
 		return 0
 	}
diff --git a/go/master/client.go b/go/master/client.go
index 8451820c19..4f8df5ba66 100644
--- a/go/master/client.go
+++ b/go/master/client.go
@@ -60,6 +60,7 @@ func (c *Client) getRecords() {
 		}
 
 		err = f.Close()
+		c.ch <- nil
 		if err != nil {
 			log.Errorln(err)
 		}
@@ -112,7 +113,7 @@ func (c *Client) monitorMaster(addr Addresser) {
 //
 // SetDataset can be called multiple times from different nodes. But
 // only the first call will be honored.
-func (c *Client) SetDataset(globPaths []string) error {
+func (c *Client) SetDataset(globPaths ...string) error {
 	return c.conn.Call("Service.SetDataset", globPaths, nil)
 }
diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py
index de8e9bb88e..9fd3ef0860 100644
--- a/python/paddle/v2/master/client.py
+++ b/python/paddle/v2/master/client.py
@@ -30,6 +30,9 @@ class client(object):
         p = ctypes.c_char_p()
         ret = ctypes.pointer(p)
         size = lib.paddle_next_record(self.c, ret)
+        if size < 0:
+            # EOF
+            return None
         if size == 0:
             # Empty record
             return ""
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index 9f888b16d6..669867fd10 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -57,22 +57,49 @@ def text_file(path):
     return reader
 
 
-def recordio(path):
+def recordio_local(paths):
     """
-    Creates a data reader that outputs record one one by one from given recordio file
-    :path: path of recordio file
-    :returns: data reader of recordio file
+    Creates a data reader that outputs records one by one
+    from the given local recordio file paths.
+    :paths: paths of recordio files.
+    :returns: data reader of recordio files.
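+
+    A minimal usage sketch (the recordio file path below is only
+    illustrative):
+
+        reader = recordio_local(["/tmp/part-00000.recordio"])
+        for record in reader():
+            print(record)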
     """
     import recordio as rec
 
     def reader():
-        f = rec.reader(path)
-        while True:
-            r = f.read()
-            if r is None:
-                break
-            yield r
-        f.close()
+        for i, path in enumerate(paths):
+            f = rec.reader(path)
+            while True:
+                r = f.read()
+                if r is None:
+                    break
+                yield r
+            f.close()
 
     return reader
+
+
+def recordio(paths, addr="", buf_size=100):
+    """
+    Creates a data reader that outputs records one by one
+    from the given local or cloud recordio paths.
+    :paths: paths of recordio files.
+    :returns: data reader of recordio files.
+    """
+    import os
+    import paddle.v2.master.client as cloud
+
+    if len(os.environ["KUBERNETES_SERVICE_HOST"]) == 0:
+        return recordio_local(paths)
+
+    c = cloud(addr, buf_size)
+    c.set_dataset(paths)
+
+    while True:
+        r = c.next_record()
+        if r is None:
+            break
+        yield r
+
+    c.close()
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py
index ba4f558874..b42d273ecf 100644
--- a/python/paddle/v2/reader/tests/creator_test.py
+++ b/python/paddle/v2/reader/tests/creator_test.py
@@ -38,7 +38,7 @@ class TestRecordIO(unittest.TestCase):
     def test_recordio(self):
         path = os.path.join(
             os.path.dirname(__file__), "test_recordio_creator.dat")
-        reader = paddle.v2.reader.creator.recordio(path)
+        reader = paddle.v2.reader.creator.recordio([path])
         for idx, r in enumerate(reader()):
             self.assertSequenceEqual(r, str(idx))

From 4874810ba5a1e6f8f6b4a9530e6854f65077a59e Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Thu, 29 Jun 2017 04:28:44 +0000
Subject: [PATCH 05/79] fix bugs

---
 go/master/client.go                |  2 +-
 python/paddle/v2/reader/creator.py | 20 ++++++++++++--------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/go/master/client.go b/go/master/client.go
index 4f8df5ba66..fa479338c5 100644
--- a/go/master/client.go
+++ b/go/master/client.go
@@ -113,7 +113,7 @@ func (c *Client) monitorMaster(addr Addresser) {
 //
 // SetDataset can be called multiple times from different nodes. But
 // only the first call will be honored.
-func (c *Client) SetDataset(globPaths ...string) error {
+func (c *Client) SetDataset(globPaths []string) error {
 	return c.conn.Call("Service.SetDataset", globPaths, nil)
 }
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index 669867fd10..3376d7accb 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -93,13 +93,17 @@ def recordio(paths, addr="", buf_size=100):
     if len(os.environ["KUBERNETES_SERVICE_HOST"]) == 0:
         return recordio_local(paths)
 
-    c = cloud(addr, buf_size)
-    c.set_dataset(paths)
+    def reader():
+        c = cloud(addr, buf_size)
+        c.set_dataset(paths)
+
+        while True:
+            r = c.next_record()
+            if r is None:
+                break
+            yield r
 
-    while True:
-        r = c.next_record()
-        if r is None:
-            break
-        yield r
+        c.close()
 
-    c.close()
+    return reader
-

From b5ab4b69bcfa604a1ebbb964da1765ff2c586a6a Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Thu, 29 Jun 2017 15:11:40 +0800
Subject: [PATCH 06/79] Follow comments, mainly use std::copy to simplify
 logic.

---
 .../gserver/layers/DetectionOutputLayer.cpp   |  20 ++--
 paddle/gserver/layers/DetectionOutputLayer.h  |   6 +-
 paddle/gserver/layers/MultiBoxLossLayer.cpp   | 109 ++++++++++--------
 python/paddle/trainer/config_parser.py        |   4 +-
 .../paddle/trainer_config_helpers/layers.py   |   7 +-
 5 files changed, 74 insertions(+), 72 deletions(-)

diff --git a/paddle/gserver/layers/DetectionOutputLayer.cpp b/paddle/gserver/layers/DetectionOutputLayer.cpp
index 2a4d7f8b5b..8ab838e191 100644
--- a/paddle/gserver/layers/DetectionOutputLayer.cpp
+++ b/paddle/gserver/layers/DetectionOutputLayer.cpp
@@ -48,8 +48,6 @@ void DetectionOutputLayer::forward(PassType passType) {
   Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
   Matrix::resizeOrCreate(
       confTmpBuffer_, confSizeSum_ / numClasses_, numClasses_, false, useGpu_);
-  locBuffer_ = locTmpBuffer_;
-  confBuffer_ = confTmpBuffer_;
 
   size_t locOffset = 0;
   size_t confOffset = 0;
@@ -68,7 +66,7 @@ void DetectionOutputLayer::forward(PassType passType) {
                                    locSizeSum_,
                                    locOffset,
                                    batchSize,
-                                   *locBuffer_,
+                                   *locTmpBuffer_,
                                    kNCHWToNHWC);
     confOffset += appendWithPermute(*inConf,
                                     height,
@@ -76,7 +74,7 @@ void DetectionOutputLayer::forward(PassType passType) {
                                     confSizeSum_,
                                     confOffset,
                                     batchSize,
-                                    *confBuffer_,
+                                    *confTmpBuffer_,
                                     kNCHWToNHWC);
   }
   CHECK_EQ(locOffset, locSizeSum_ / batchSize);
   CHECK_EQ(confOffset, confSizeSum_ / batchSize);
@@ -100,23 +98,25 @@ void DetectionOutputLayer::forward(PassType passType) {
     priorValue = priorCpuValue_;
   } else {
     priorValue = getInputValue(*getPriorBoxLayer());
+    locBuffer_ = locTmpBuffer_;
+    confBuffer_ = confTmpBuffer_;
   }
   confBuffer_->softmax(*confBuffer_);
 
   size_t numPriors = priorValue->getElementCnt() / 8;
-  vector<vector<NormalizedBBox>> allDecodedBBoxes;
+  std::vector<std::vector<NormalizedBBox>> allDecodedBBoxes;
   for (size_t n = 0; n < batchSize; ++n) {
-    vector<NormalizedBBox> decodedBBoxes;
+    std::vector<NormalizedBBox> decodedBBoxes;
     for (size_t i = 0; i < numPriors; ++i) {
       size_t priorOffset = i * 8;
       size_t locPredOffset = n * numPriors * 4 + i * 4;
-      vector<NormalizedBBox> priorBBoxVec;
+      std::vector<NormalizedBBox> priorBBoxVec;
       getBBoxFromPriorData(
           priorValue->getData() + priorOffset, 1, priorBBoxVec);
-      vector<vector<real>> priorBBoxVar;
+      std::vector<std::vector<real>> priorBBoxVar;
       getBBoxVarFromPriorData(
          priorValue->getData() + priorOffset, 1, priorBBoxVar);
-      vector<real> locPredData;
+      std::vector<real> locPredData;
       for (size_t j = 0; j < 4; ++j)
         locPredData.push_back(*(locBuffer_->getData() + locPredOffset + j));
       NormalizedBBox bbox =
           decodeBBoxWithVar(priorBBoxVec[0], priorBBoxVar[0], locPredData);
       decodedBBoxes.push_back(bbox);
     }
     allDecodedBBoxes.push_back(decodedBBoxes);
   }
 
-  vector<map<size_t, vector<size_t>>> allIndices;
+  std::vector<std::map<size_t, std::vector<size_t>>> allIndices;
   size_t numKept = getDetectionIndices(confBuffer_->getData(),
                                        numPriors,
                                        numClasses_,
diff --git a/paddle/gserver/layers/DetectionOutputLayer.h b/paddle/gserver/layers/DetectionOutputLayer.h
index 38271cb054..9cc568219c 100644
--- a/paddle/gserver/layers/DetectionOutputLayer.h
+++ b/paddle/gserver/layers/DetectionOutputLayer.h
@@ -19,17 +19,13 @@ limitations under the License. */
 #include "DetectionUtil.h"
 #include "Layer.h"
 
-using std::vector;
-using std::map;
-using std::pair;
-
 namespace paddle {
 
 /**
  * The detection output layer for an SSD detection task. This layer applies
  * non-maximum suppression (NMS) to all of the predicted bounding boxes and
  * keeps the top-K bounding boxes.
  * - Input: This layer needs three input layers: the first input layer
  *   is the priorbox layer. The remaining two input layers are convolution
  *   layers that generate the bbox location offsets and the classification
  *   confidences.
diff --git a/paddle/gserver/layers/MultiBoxLossLayer.cpp b/paddle/gserver/layers/MultiBoxLossLayer.cpp
index 27a2cc3fa4..f2d7b8eb1d 100644
--- a/paddle/gserver/layers/MultiBoxLossLayer.cpp
+++ b/paddle/gserver/layers/MultiBoxLossLayer.cpp
@@ -17,10 +17,6 @@ limitations under the License. */
 #include "MultiBoxLossLayer.h"
 #include <float.h>
 #include <vector>
 #include "DataLayer.h"
 
-using std::vector;
-using std::map;
-using std::pair;
-
 namespace paddle {
 
 REGISTER_LAYER(multibox_loss, MultiBoxLossLayer);
@@ -133,7 +129,7 @@ void MultiBoxLossLayer::forward(PassType passType) {
   }
 
   // Get max scores for each prior bbox. Used in negative mining
-  vector<vector<real>> allMaxConfScore;
+  std::vector<std::vector<real>> allMaxConfScore;
   numPriors_ = priorValue->getElementCnt() / 8;
   getMaxConfidenceScores(confBuffer_->getData(),
                          batchSize,
@@ -151,18 +147,18 @@ void MultiBoxLossLayer::forward(PassType passType) {
   allMatchIndices_.clear();
   allNegIndices_.clear();
 
-  pair<size_t, size_t> retPair = generateMatchIndices(*priorValue,
-                                                      numPriors_,
-                                                      *labelValue,
-                                                      labelIndex,
-                                                      seqNum,
-                                                      allMaxConfScore,
-                                                      batchSize,
-                                                      overlapThreshold_,
-                                                      negOverlap_,
-                                                      negPosRatio_,
-                                                      &allMatchIndices_,
-                                                      &allNegIndices_);
+  std::pair<size_t, size_t> retPair = generateMatchIndices(*priorValue,
+                                                           numPriors_,
+                                                           *labelValue,
+                                                           labelIndex,
+                                                           seqNum,
+                                                           allMaxConfScore,
+                                                           batchSize,
+                                                           overlapThreshold_,
+                                                           negOverlap_,
+                                                           negPosRatio_,
+                                                           &allMatchIndices_,
+                                                           &allNegIndices_);
   numMatches_ = retPair.first;
   numNegs_ = retPair.second;
 
@@ -175,30 +171,31 @@ void MultiBoxLossLayer::forward(PassType passType) {
     locDiff_->zeroMem();
-    vector<real> locGTData;
+    std::vector<real> locGTData;
+    real* locDiffData = locDiff_->getData();
+    const real* locBufferData = locBuffer_->getData();
 
     for (size_t n = 0; n < batchSize; ++n) {
       for (size_t i = 0; i < numPriors_; ++i) {
         if (allMatchIndices_[n][i] == -1) continue;  // match none
         size_t locOffset =
             n * (locBuffer_->getElementCnt() / batchSize) + i * 4;
-        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[0];
-        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[1];
-        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[2];
-        locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[3];
-
+        std::copy(locBufferData + locOffset,
+                  locBufferData + locOffset + 4,
+                  locDiffData + count);
+        count += 4;
         const int gtIdx = allMatchIndices_[n][i];
         size_t priorOffset = i * 8;
-        vector<NormalizedBBox> priorBBoxVec;
+        std::vector<NormalizedBBox> priorBBoxVec;
         getBBoxFromPriorData(
             priorValue->getData() + priorOffset, 1, priorBBoxVec);
+ priorOffset, 1, priorBBoxVec); - vector> priorBBoxVar; + std::vector> priorBBoxVar; getBBoxVarFromPriorData( priorValue->getData() + priorOffset, 1, priorBBoxVar); size_t labelOffset = (labelIndex[n] + gtIdx) * 6; - vector gtBBoxVec; + std::vector gtBBoxVec; getBBoxFromLabelData(labelValue->getData() + labelOffset, 1, gtBBoxVec); - vector gtEncode; + std::vector gtEncode; encodeBBoxWithVar( priorBBoxVec[0], priorBBoxVar[0], gtBBoxVec[0], gtEncode); locGTData.insert(locGTData.end(), gtEncode.begin(), gtEncode.end()); @@ -218,7 +215,9 @@ void MultiBoxLossLayer::forward(PassType passType) { confProb_->zeroMem(); size_t count = 0; - vector confPredData; + std::vector confPredData; + real* confProbData = confProb_->getData(); + const real* confBufferData = confBuffer_->getData(); for (size_t n = 0; n < batchSize; ++n) { for (size_t i = 0; i < numPriors_; ++i) { if (allMatchIndices_[n][i] == -1) continue; @@ -226,11 +225,13 @@ void MultiBoxLossLayer::forward(PassType passType) { const int gtLabel = (labelValue->getData() + labelOffset)[0]; confGTData_->getData()[count] = gtLabel; size_t confOffset = n * numPriors_ * numClasses_ + i * numClasses_; - for (size_t j = 0; j < numClasses_; ++j) { - confProb_->getData()[count * numClasses_ + j] = - (confBuffer_->getData() + confOffset)[j]; - confPredData.push_back((confBuffer_->getData() + confOffset)[j]); - } + std::copy(confBufferData + confOffset, + confBufferData + confOffset + numClasses_, + confProbData + count * numClasses_); + confPredData.reserve(confPredData.size() + numClasses_); + confPredData.insert(confPredData.end(), + confBufferData + confOffset, + confBufferData + confOffset + numClasses_); ++count; } // Negative mining samples @@ -238,14 +239,17 @@ void MultiBoxLossLayer::forward(PassType passType) { confGTData_->getData()[count] = backgroundId_; size_t confOffset = n * numPriors_ * numClasses_ + allNegIndices_[n][i] * numClasses_; - for (size_t j = 0; j < numClasses_; ++j) { - confProb_->getData()[count * numClasses_ + j] = - (confBuffer_->getData() + confOffset)[j]; - confPredData.push_back((confBuffer_->getData() + confOffset)[j]); - } - count++; + std::copy(confBufferData + confOffset, + confBufferData + confOffset + numClasses_, + confProbData + count * numClasses_); + confPredData.reserve(confPredData.size() + numClasses_); + confPredData.insert(confPredData.end(), + confBufferData + confOffset, + confBufferData + confOffset + numClasses_); + ++count; } } + CHECK_EQ(numConf_, count); confProb_->softmax(*confProb_); MatrixPtr confLossOutput; Matrix::resizeOrCreate(confLossOutput, numConf_, 1, false, false); @@ -254,7 +258,7 @@ void MultiBoxLossLayer::forward(PassType passType) { } real loss = locLoss_ + confLoss_; MatrixPtr outV = getOutputValue(); - vector tmp(batchSize, loss); + std::vector tmp(batchSize, loss); outV->copyFrom(&tmp[0], batchSize); } @@ -274,16 +278,18 @@ void MultiBoxLossLayer::backward(const UpdateCallback& callback) { locDiff_->getData()[i] *= (1. 
/ numMatches_); // Copy gradient back size_t count = 0; - for (size_t n = 0; n < batchSize; ++n) + const real* locDiffData = locDiff_->getData(); + for (size_t n = 0; n < batchSize; ++n) { for (size_t i = 0; i < numPriors_; ++i) { if (allMatchIndices_[n][i] == -1) continue; - real* locDiffData = locBuffer_->getData() + n * numPriors_ * 4 + i * 4; - locDiffData[0] = (locDiff_->getData() + count * 4)[0]; - locDiffData[1] = (locDiff_->getData() + count * 4)[1]; - locDiffData[2] = (locDiff_->getData() + count * 4)[2]; - locDiffData[3] = (locDiff_->getData() + count * 4)[3]; + real* locBufferData = + locBuffer_->getData() + n * numPriors_ * 4 + i * 4; + std::copy(locDiffData + count * 4, + locDiffData + (count + 1) * 4, + locBufferData); ++count; } + } CHECK_EQ(count, numMatches_); } @@ -293,21 +299,24 @@ void MultiBoxLossLayer::backward(const UpdateCallback& callback) { for (size_t i = 0; i < numConf_ * numClasses_; ++i) confProb_->getData()[i] *= (1. / numMatches_); size_t count = 0; + const real* confProbData = confProb_->getData(); for (size_t n = 0; n < batchSize; ++n) { for (size_t i = 0; i < numPriors_; ++i) { if (allMatchIndices_[n][i] == -1) continue; real* confDiffData = confBuffer_->getData() + n * numPriors_ * numClasses_ + i * numClasses_; - for (size_t j = 0; j < numClasses_; ++j) - confDiffData[j] = (confProb_->getData() + count * numClasses_)[j]; + std::copy(confProbData + count * numClasses_, + confProbData + (count + 1) * numClasses_, + confDiffData); ++count; } for (size_t i = 0; i < allNegIndices_[n].size(); ++i) { int idx = allNegIndices_[n][i]; real* confDiffData = confBuffer_->getData() + n * numPriors_ * numClasses_ + idx * numClasses_; - for (size_t j = 0; j < numClasses_; ++j) - confDiffData[j] = (confProb_->getData() + count * numClasses_)[j]; + std::copy(confProbData + count * numClasses_, + confProbData + (count + 1) * numClasses_, + confDiffData); ++count; } } diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index c46b335d99..17f6704ea1 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1679,7 +1679,7 @@ class PriorBoxLayer(LayerBase): @config_layer('multibox_loss') class MultiBoxLossLayer(LayerBase): def __init__(self, name, inputs, input_num, num_classes, overlap_threshold, - neg_pos_ratio, neg_overlap, background_id): + neg_pos_ratio, neg_overlap, background_id, **xargs): super(MultiBoxLossLayer, self).__init__(name, 'multibox_loss', 0, inputs) config_assert( @@ -1701,7 +1701,7 @@ class MultiBoxLossLayer(LayerBase): class DetectionOutputLayer(LayerBase): def __init__(self, name, inputs, size, input_num, num_classes, nms_threshold, nms_top_k, keep_top_k, confidence_threshold, - background_id): + background_id, **xargs): super(DetectionOutputLayer, self).__init__(name, 'detection_output', 0, inputs) config_assert( diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 770559dc77..1286ed198e 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -1092,22 +1092,19 @@ def multibox_loss_layer(input_loc, :type background_id: int :return: LayerOutput """ - input_loc_num = 0 - input_conf_num = 0 - if isinstance(input_loc, LayerOutput): input_loc = [input_loc] assert isinstance(input_loc, collections.Sequence) # list or tuple for each in input_loc: assert isinstance(each, LayerOutput) - input_loc_num += 1 + input_loc_num = len(input_loc) if 
isinstance(input_conf, LayerOutput): input_conf = [input_conf] assert isinstance(input_conf, collections.Sequence) # list or tuple for each in input_conf: assert isinstance(each, LayerOutput) - input_conf_num += 1 + input_conf_num = len(input_conf) # Check the input layer number. assert input_loc_num == input_conf_num From 0fa409246b98c636d4dd32553782ca962f70a6f7 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 29 Jun 2017 09:43:00 +0000 Subject: [PATCH 07/79] fix bugs --- go/master/c/client.go | 18 ++++++++++++++++-- go/master/client.go | 21 +++++++++++++++------ go/master/client_test.go | 18 ++++++++++++++---- python/paddle/v2/reader/creator.py | 6 ++---- 4 files changed, 47 insertions(+), 16 deletions(-) diff --git a/go/master/c/client.go b/go/master/c/client.go index b88911b858..79e13e4b63 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -13,6 +13,7 @@ typedef int paddle_master_client; import "C" import ( + "io" "sync" "unsafe" @@ -84,14 +85,27 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int return C.PADDLE_MASTER_OK } +// return value: +// 0:ok +// -1:EOF +// -2:error //export paddle_next_record func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { c := get(client) - r := c.NextRecord() - if r == nil { + r, err := c.NextRecord() + if err == io.EOF { // EOF + *record = (*C.uchar)(nullPtr) return -1 } + + if err != nil { + // Error + // TODO: return the type of error? + *record = (*C.uchar)(nullPtr) + return -2 + } + if len(r) == 0 { // Empty record *record = (*C.uchar)(nullPtr) diff --git a/go/master/client.go b/go/master/client.go index fa479338c5..c122d17c8f 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -1,6 +1,7 @@ package master import ( + "io" "os" "time" @@ -17,7 +18,12 @@ type Addresser interface { // Client is the client of the master server. type Client struct { conn *connection.Conn - ch chan []byte + ch chan record +} + +type record struct { + r []byte + err error } // NewClient creates a new Client. @@ -27,7 +33,7 @@ type Client struct { func NewClient(addr Addresser, bufSize int) *Client { c := &Client{} c.conn = connection.New() - c.ch = make(chan []byte, bufSize) + c.ch = make(chan record, bufSize) go c.monitorMaster(addr) go c.getRecords() return c @@ -52,18 +58,20 @@ func (c *Client) getRecords() { s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1) for s.Scan() { - c.ch <- s.Record() + c.ch <- record{s.Record(), nil} } if s.Err() != nil { + c.ch <- record{nil, s.Err()} log.Errorln(err, chunk.Path) } err = f.Close() - c.ch <- nil if err != nil { log.Errorln(err) } + + c.ch <- record{nil, io.EOF} } // We treat a task as finished whenever the last data @@ -133,6 +141,7 @@ func (c *Client) taskFinished(taskID int) error { // // NextRecord will block until the next record is available. It is // thread-safe. 
-func (c *Client) NextRecord() []byte {
-	return <-c.ch
+func (c *Client) NextRecord() ([]byte, error) {
+	r := <-c.ch
+	return r.r, r.err
 }
diff --git a/go/master/client_test.go b/go/master/client_test.go
index 85a86761c2..05201941e3 100644
--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@@ -2,6 +2,7 @@ package master_test
 
 import (
 	"fmt"
+	"io"
 	"net"
 	"net/http"
 	"net/rpc"
@@ -69,13 +70,22 @@ func TestNextRecord(t *testing.T) {
 
 	for pass := 0; pass < 50; pass++ {
 		received := make(map[byte]bool)
-		for i := 0; i < total; i++ {
-			r := c.NextRecord()
+		for i := 0; i <= total; i++ {
+			r, err := c.NextRecord()
+			if err == io.EOF {
+				break
+			}
+
+			if err != nil {
+				t.Fatal(pass, i, "Read error:", err)
+			}
+
 			if len(r) != 1 {
-				t.Fatal("Length should be 1.", r)
+				t.Fatal(pass, i, "Length should be 1.", r)
 			}
+
 			if received[r[0]] {
-				t.Fatal("Received duplicate.", received, r)
+				t.Fatal(pass, i, "Received duplicate.", received, r)
 			}
 			received[r[0]] = true
 		}
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index 3376d7accb..b575f57dc6 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -79,7 +79,6 @@ def recordio_local(paths):
 
     return reader
 
-
 def recordio(paths, addr="", buf_size=100):
     """
     Creates a data reader that outputs record one one by one
@@ -90,8 +89,8 @@ def recordio(paths, addr="", buf_size=100):
     import os
     import paddle.v2.master.client as cloud
 
-    if len(os.environ["KUBERNETES_SERVICE_HOST"]) == 0:
-        return recordio_local(path)
+    if "KUBERNETES_SERVICE_HOST" not in os.environ.keys():
+        return recordio_local(paths)
 
     def reader():
         c = cloud(addr, buf_size)
@@ -106,4 +105,3 @@ def recordio(paths, addr="", buf_size=100):
         c.close()
 
     return reader
-
From 9f408dfb1b81daee795d9c0d8ed177e6ab4e10a8 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Thu, 29 Jun 2017 09:52:21 +0000
Subject: [PATCH 08/79] fix bugs

---
 python/paddle/v2/master/client.py  | 18 ++++++++++++++----
 python/paddle/v2/reader/creator.py |  2 +-
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py
index 9fd3ef0860..0cc01b7310 100644
--- a/python/paddle/v2/master/client.py
+++ b/python/paddle/v2/master/client.py
@@ -26,17 +26,27 @@ class client(object):
             holder[idx] = c_ptr
         lib.paddle_set_dataset(self.c, holder, len(paths))
 
+    # return format: (record, errno)
+    # errno = 0: ok
+    #       = -1: EOF
+    #       < -1: error
     def next_record(self):
         p = ctypes.c_char_p()
         ret = ctypes.pointer(p)
         size = lib.paddle_next_record(self.c, ret)
-        if size < 0:
+        if size == -1:
             # EOF
-            return None
+            return None, -1
+
+        if size < -1:
+            # Error
+            return None, size
+
         if size == 0:
             # Empty record
-            return ""
+            return "", 0
+
         record = ret.contents.value[:size]
         # Memory created from C should be freed.
         lib.mem_free(ret.contents)
-        return record
+        return record, 0
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index b575f57dc6..2e8626e565 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -97,7 +97,7 @@ def recordio(paths, addr="", buf_size=100):
         c.set_dataset(paths)
 
         while True:
-            r = client.next_record()
+            r, err = c.next_record()
             if r is None:
                 break
             yield r
From b79784ee9e0fd67933d4793e8ab4564f7a30c780 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Thu, 29 Jun 2017 21:31:42 +0800
Subject: [PATCH 09/79] Add test for configuration and add doc.
---
 doc/api/v2/config/layer.rst                        | 13 ++++++++++
 .../tests/configs/file_list.sh                     |  2 +-
 .../configs/test_detection_output_layer.py         | 23 +++++++++++++++++
 .../tests/configs/test_multibox_loss_layer.py      | 25 +++++++++++++++++++
 4 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
 create mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py

diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index c7b017bc07..0a8465919d 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -445,6 +445,11 @@ smooth_l1_cost
 .. autoclass:: paddle.v2.layer.smooth_l1_cost
   :noindex:
 
+multibox_loss
+--------------
+.. autoclass:: paddle.v2.layer.multibox_loss
+  :noindex:
+
 Check Layer
 ============
 
@@ -468,3 +473,11 @@ prelu
 --------
 .. autoclass:: paddle.v2.layer.prelu
   :noindex:
+
+Detection Output Layer
+======================
+
+detection_output
+----------------
+.. autoclass:: paddle.v2.layer.detection_output
+  :noindex:
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index c24102255f..45fb848886 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -6,6 +6,6 @@ img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cos
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
-test_prelu_layer test_row_conv)
+test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
new file mode 100644
index 0000000000..3572a2cb07
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
@@ -0,0 +1,23 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
+
+input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
+
+priorbox = data_layer(name='priorbox', size=32, height=4, width=8)
+
+detout = detection_output_layer(
+    input_loc=input_loc,
+    input_conf=input_conf,
+    priorbox=priorbox,
+    num_classes=21,
+    nms_threshold=0.45,
+    nms_top_k=400,
+    keep_top_k=200,
+    confidence_threshold=0.01,
+    background_id=0,
+    name='test_detection_output')
+
+outputs(detout)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
new file mode 100644
index 0000000000..c3376c47bd
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
@@ -0,0 +1,25 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
+
+input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
+
+priorbox = data_layer(name='priorbox', size=32, height=4, width=8)
+
+label = data_layer(name='label', size=24, height=4, width=6)
+
+multibox_loss =
multibox_loss_layer( + input_loc=input_loc, + input_conf=input_conf, + priorbox=priorbox, + label=label, + num_classes=21, + overlap_threshold=0.5, + neg_pos_ratio=3.0, + neg_overlap=0.5, + background_id=0, + name='test_multibox_loss') + +outputs(multibox_loss) From b3c5808e13bc94fbc933c803c59fed979a11f515 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 30 Jun 2017 03:11:57 +0000 Subject: [PATCH 10/79] rm cloud EOF --- go/master/c/client.go | 7 ------- go/master/client.go | 3 --- go/master/client_test.go | 7 +------ python/paddle/v2/master/client.py | 5 ----- 4 files changed, 1 insertion(+), 21 deletions(-) diff --git a/go/master/c/client.go b/go/master/c/client.go index 79e13e4b63..a37894fefe 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -13,7 +13,6 @@ typedef int paddle_master_client; import "C" import ( - "io" "sync" "unsafe" @@ -93,12 +92,6 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { c := get(client) r, err := c.NextRecord() - if err == io.EOF { - // EOF - *record = (*C.uchar)(nullPtr) - return -1 - } - if err != nil { // Error // TODO: return the type of error? diff --git a/go/master/client.go b/go/master/client.go index c122d17c8f..985b96b0af 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -1,7 +1,6 @@ package master import ( - "io" "os" "time" @@ -70,8 +69,6 @@ func (c *Client) getRecords() { if err != nil { log.Errorln(err) } - - c.ch <- record{nil, io.EOF} } // We treat a task as finished whenever the last data diff --git a/go/master/client_test.go b/go/master/client_test.go index 05201941e3..0a401d8a43 100644 --- a/go/master/client_test.go +++ b/go/master/client_test.go @@ -2,7 +2,6 @@ package master_test import ( "fmt" - "io" "net" "net/http" "net/rpc" @@ -70,12 +69,8 @@ func TestNextRecord(t *testing.T) { for pass := 0; pass < 50; pass++ { received := make(map[byte]bool) - for i := 0; i <= total; i++ { + for i := 0; i < total; i++ { r, err := c.NextRecord() - if err == io.EOF { - break - } - if err != nil { t.Fatal(pass, i, "Read error:", err) } diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py index 0cc01b7310..6ddb09e4e8 100644 --- a/python/paddle/v2/master/client.py +++ b/python/paddle/v2/master/client.py @@ -28,16 +28,11 @@ class client(object): # return format: (record, errno) # errno = 0: ok - # = -1: EOF # < -1: error def next_record(self): p = ctypes.c_char_p() ret = ctypes.pointer(p) size = lib.paddle_next_record(self.c, ret) - if size == -1: - # EOF - return None, -1 - if size < -1: # Error return None, size From 97bbd179569f48bfcf1a3ff3225c331ad8e3fbf4 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 30 Jun 2017 03:14:29 +0000 Subject: [PATCH 11/79] rm cloud EOF --- go/master/c/client.go | 1 - 1 file changed, 1 deletion(-) diff --git a/go/master/c/client.go b/go/master/c/client.go index a37894fefe..13ed3b7680 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -86,7 +86,6 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int // return value: // 0:ok -// -1:EOF // -2:error //export paddle_next_record func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { From 26e661bc51e2fac36c3692d748b7db8a950cb370 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 3 Jul 2017 03:05:36 +0000 Subject: [PATCH 12/79] fix by helin's comments --- go/master/c/client.go | 4 ++-- python/paddle/v2/master/client.py | 4 ++-- 
python/paddle/v2/reader/creator.py | 34 ++++++++++++++++++------------ 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/go/master/c/client.go b/go/master/c/client.go index 635688f196..31f4311974 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -106,7 +106,7 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int // return value: // 0:ok -// -2:error +// -1:error //export paddle_next_record func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { c := get(client) @@ -115,7 +115,7 @@ func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { // Error // TODO: return the type of error? *record = (*C.uchar)(nullPtr) - return -2 + return -1 } if len(r) == 0 { diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py index 6ddb09e4e8..70f9e43c96 100644 --- a/python/paddle/v2/master/client.py +++ b/python/paddle/v2/master/client.py @@ -28,12 +28,12 @@ class client(object): # return format: (record, errno) # errno = 0: ok - # < -1: error + # < 0: error def next_record(self): p = ctypes.c_char_p() ret = ctypes.pointer(p) size = lib.paddle_next_record(self.c, ret) - if size < -1: + if size < 0: # Error return None, size diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py index 2e8626e565..20624d5286 100644 --- a/python/paddle/v2/reader/creator.py +++ b/python/paddle/v2/reader/creator.py @@ -57,29 +57,31 @@ def text_file(path): return reader -def recordio_local(paths): +def recordio_local(paths, buf_size=100): """ - Creates a data reader that outputs record one one by one - from given local recordio fils path. + Creates a data reader from given RecordIO file paths separated by ",", + glob pattern is supported. :path: path of recordio files. :returns: data reader of recordio files. """ import recordio as rec + import paddle.v2.reader.decorator as dec def reader(): - for i, path in enumerate(paths): - f = rec.reader(path) - while True: - r = f.read() - if r is None: - break - yield r - f.close() + a = ','.join(paths) + f = rec.reader(a) + while True: + r = f.read() + if r is None: + break + yield r + f.close() + + return dec.buffered(reader, buf_size) - return reader -def recordio(paths, addr="", buf_size=100): +def recordio(paths, buf_size=100): """ Creates a data reader that outputs record one one by one from given local or cloud recordio path. 
@@ -92,6 +94,12 @@ def recordio(paths, addr="", buf_size=100):
     if "KUBERNETES_SERVICE_HOST" not in os.environ.keys():
         return recordio_local(paths)
 
+    host_name = "MASTER_SERVICE_HOST"
+    if host_name not in os.environ.keys():
+        raise Exception('cannot find ' + host_name + ' in the environment.')
+
+    addr = os.environ[host_name]
+
     def reader():
         c = cloud(addr, buf_size)
         c.set_dataset(paths)
From 16b8e59e1ab8cb33d175ce6d4bfe3f19419acb06 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Mon, 3 Jul 2017 15:32:51 +0800
Subject: [PATCH 13/79] Update new authors

---
 AUTHORS.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/AUTHORS.md b/AUTHORS.md
index d5baee2161..08eaab10ea 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -1,5 +1,23 @@
 | Github account | name |
 |---|---|
+| beckett1124 | Bin Qi |
+| Canpio | Jiayi Feng |
+| chengxiaohua1105 | Xiaohua Cheng |
+| xushaoyong | Shaoyong Xu |
+| liuyuan | Yuan Liu |
+| xujun05 | Jun Xu |
+| dzhwinter | Zhihong Dong |
+| Guo Sheng | Sheng Guo |
+| kuke | Yibing Liu |
+| llxxxll | YongFeng Liu |
+| cxysteven | Xingyi Cheng |
+| NHZlX | Zhaolong Xing |
+| pakchoi | Chuanjiang Song |
+| pkuyym | Yaming Yang |
+| Superjom | Chunwei Yan |
+| wanghaoshuang | Haoshuang Wang |
+| wangzhen-nlp | Zhen Wang |
+| wwhu | Weiwei Hu |
 | reyoung | Yang Yu |
 | gangliao | Gang Liao |
 | luotao01 | Tao Luo |
From 696ba1d2e1f3fdac763c4dd29b5353b512f9b7fa Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Mon, 3 Jul 2017 16:01:50 +0800
Subject: [PATCH 14/79] init tensor_test.cc

---
 paddle/framework/CMakeLists.txt |  1 +
 paddle/framework/tensor.h       |  5 +--
 paddle/framework/tensor_test.cc | 71 +++++++++++++++++++++++++++++++++
 3 files changed, 74 insertions(+), 3 deletions(-)
 create mode 100644 paddle/framework/tensor_test.cc

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 6aa6b9bc2d..41bf3837aa 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -2,6 +2,7 @@ cc_library(ddim SRCS ddim.cc)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+cc_test(tensor_test SRCS tensor_test.cc DEPS ddim)
 cc_test(variable_test SRCS variable_test.cc)
 cc_test(scope_test SRCS scope_test.cc)
 cc_test(enforce_test SRCS enforce_test.cc)
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 067f2a8526..8d658d5097 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -19,13 +19,12 @@ namespace framework {
 
 class Tensor {
   using paddle::platform::Place;
-  using paddle::platform::get_place;
 
  public:
  template <typename T>
  const T* data() const {
-    PADDLE_ASSERT(holder_ != nullptr,
-                  "Tensor::data must be called after Tensor::mutable_data");
+    PADDLE_ENFORCE(holder_ != nullptr,
+                   "Tensor::data must be called after Tensor::mutable_data");
     return static_cast<const T*>(holder->Ptr());
   }
 
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
new file mode 100644
index 0000000000..fa44b24b64
--- /dev/null
+++ b/paddle/framework/tensor_test.cc
@@ -0,0 +1,71 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include "paddle/framework/tensor.h"
+#include <gtest/gtest.h>
+
+TEST(Tensor, Data) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  Tensor cpu_tensor;
+}
+
+/* mutable_data() is not tested at present
+   because Memory::Alloc() and Memory::Free() are not ready yet.
+
+TEST(Tensor, MutableData) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  Tensor cpu_tensor;
+  float* p1 = nullptr;
+  float* p2 = nullptr;
+  // initialization
+  p1 = cpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
+  EXPECT_NE(p1, nullptr);
+  // set cpu_tensor a new dim with large size
+  // memory is supposed to be re-allocated
+  p2 = cpu_tensor.mutable_data<float>(make_ddim({3, 4}));
+  EXPECT_NE(p2, nullptr);
+  EXPECT_NE(p1, p2);
+  // set cpu_tensor a new dim with same size
+  // memory block is supposed to be unchanged
+  p1 = cpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
+  EXPECT_EQ(p1, p2);
+  // set cpu_tensor a new dim with smaller size
+  // memory block is supposed to be unchanged
+  p2 = cpu_tensor.mutable_data<float>(make_ddim({2, 2}));
+  EXPECT_EQ(p1, p2);
+
+  Tensor gpu_tensor;
+  float* p1 = nullptr;
+  float* p2 = nullptr;
+  // initialization
+  p1 = gpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
+  EXPECT_NE(p1, nullptr);
+  // set gpu_tensor a new dim with large size
+  // memory is supposed to be re-allocated
+  p2 = gpu_tensor.mutable_data<float>(make_ddim({3, 4}));
+  EXPECT_NE(p2, nullptr);
+  EXPECT_NE(p1, p2);
+  // set gpu_tensor a new dim with same size
+  // memory block is supposed to be unchanged
+  p1 = gpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
+  EXPECT_EQ(p1, p2);
+  // set gpu_tensor a new dim with smaller size
+  // memory block is supposed to be unchanged
+  p2 = gpu_tensor.mutable_data<float>(make_ddim({2, 2}));
+  EXPECT_EQ(p1, p2);
+}
+*/
\ No newline at end of file
From 9f408dfb1b81daee795d9c0d8ed177e6ab4e10a8 Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Mon, 3 Jul 2017 16:20:25 +0800
Subject: [PATCH 15/79] fix some compile error

---
 paddle/framework/tensor.h | 33 ++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 8d658d5097..7fa662fbb5 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -14,32 +14,39 @@ limitations under the License.
*/
 
 #pragma once
+#include <memory>
+#include <type_traits>
+#include <typeinfo>
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/enforce.h"
+#include "paddle/memory/memory.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/place.h"
+
 namespace paddle {
 namespace framework {
 
 class Tensor {
-  using paddle::platform::Place;
-
  public:
  template <typename T>
  const T* data() const {
     PADDLE_ENFORCE(holder_ != nullptr,
                    "Tensor::data must be called after Tensor::mutable_data");
-    return static_cast<const T*>(holder->Ptr());
+    return static_cast<const T*>(holder_->Ptr());
   }
 
   template <typename T,
-            typename std::enable_if<std::is_pod<T>::value>::type>
-  T* mutable_data(DDim dims, Place place) {
+            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
+  T* mutable_data(DDim dims, paddle::platform::Place place) {
     if (holder_ == nullptr || holder_->Place() != place ||
-        holder_->Size() < dims.product() * sizeof(T)) {
-      holder_.reset(new PlaceholderImpl<T>(place, dims.product() * sizeof(T)));
+        holder_->Size() < product(dims) * sizeof(T)) {
+      holder_.reset(new PlaceholderImpl<T>(place, product(dims) * sizeof(T)));
     }
     return static_cast<T*>(holder_->Ptr());
   }
 
   template <typename T,
-            typename std::enable_if<std::is_pod<T>::value>::type>
+            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
   T* mutable_data(DDim dims) {
     return mutable_data<T>(dims, paddle::platform::get_place());
   }
@@ -50,24 +57,24 @@ class Tensor {
   struct Placeholder {
     virtual ~Placeholder() {}
     virtual void* Ptr() const = 0;
-    virtual Place Place() const = 0;
+    virtual paddle::platform::Place Place() const = 0;
     virtual size_t Size() const = 0;
   };
 
   template <typename T>
   struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(Place pl, size_t size)
+    PlaceholderImpl(paddle::platform::Place pl, size_t size)
         : ptr_(paddle::memory::Alloc(pl, size), paddle::memory::Deleter(pl)),
           place_(pl),
           size_(size) {}
 
    virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
    virtual size_t Size() const { return size_; }
-    virtual Place Place() const { return place_; }
+    virtual paddle::platform::Place Place() const { return place_; }
 
    std::unique_ptr<T, paddle::memory::Deleter> ptr_;
-    Place place_;  // record the place of ptr_.
-    size_t size_;  // size of the memory block.
+    paddle::platform::Place place_;  // record the place of ptr_.
+    size_t size_;                    // size of the memory block.
  };
 
  std::unique_ptr<Placeholder> holder_;  // holds the memory block if allocated.
From bdd27208778e82ca037b2b3f6d25337403db4092 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Mon, 3 Jul 2017 16:26:33 +0800
Subject: [PATCH 16/79] Add OpProto implementation

OpProto is a proto message that helps 3rd-party language bindings,
e.g. `Python`, to generate operator creation methods. The operator
creation method is the low-level API for 3rd-party language bindings.
Op creation methods take the user's input in that language, convert
the user's input into an `OpDesc` message, then pass that `OpDesc`
message to Paddle's C++ core to create an operator.

* A separate `attr_type.proto` is added, because that file would be
  included by `op_desc.proto` in the future.
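As a rough sketch only (not code in this patch), a binding generator
could walk an `OpProto` to assemble the doc-string of the generated op
creation method. `DocString` below is a hypothetical helper invented
for illustration, not an API introduced here:

    // Hypothetical sketch: build a human-readable doc-string from OpProto.
    #include <string>
    #include <paddle/framework/op_proto.pb.h>

    std::string DocString(const paddle::framework::OpProto& proto) {
      // Stitch the op-level comment and each field's comment together.
      std::string doc = proto.comment() + "\n";
      for (const auto& input : proto.inputs())
        doc += "input " + input.name() + ": " + input.comment() + "\n";
      for (const auto& output : proto.outputs())
        doc += "output " + output.name() + ": " + output.comment() + "\n";
      for (const auto& attr : proto.attrs())
        doc += "attr " + attr.name() + ": " + attr.comment() + "\n";
      return doc;
    }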
---
 paddle/framework/CMakeLists.txt  |  1 +
 paddle/framework/attr_type.proto | 28 +++++++++++++
 paddle/framework/op_proto.proto  | 69 ++++++++++++++++++++++++++++++++
 3 files changed, 98 insertions(+)
 create mode 100644 paddle/framework/attr_type.proto
 create mode 100644 paddle/framework/op_proto.proto

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 6aa6b9bc2d..3284015908 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -5,3 +5,4 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_test(variable_test SRCS variable_test.cc)
 cc_test(scope_test SRCS scope_test.cc)
 cc_test(enforce_test SRCS enforce_test.cc)
+proto_library(op_proto SRCS op_proto.proto attr_type.proto)
diff --git a/paddle/framework/attr_type.proto b/paddle/framework/attr_type.proto
new file mode 100644
index 0000000000..2d8e0476d7
--- /dev/null
+++ b/paddle/framework/attr_type.proto
@@ -0,0 +1,28 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax="proto2";
+package paddle.framework;
+
+// Attribute Type for paddle's Op.
+// Op contains many attributes. Each type of attributes could be different.
+// The AttrType will be shared between AttrDesc and AttrProto.
+enum AttrType {
+    INT = 0;
+    FLOAT = 1;
+    STRING = 2;
+    INTS = 3;
+    FLOATS = 4;
+    STRINGS = 5;
+}
\ No newline at end of file
diff --git a/paddle/framework/op_proto.proto b/paddle/framework/op_proto.proto
new file mode 100644
index 0000000000..22df6f9c6b
--- /dev/null
+++ b/paddle/framework/op_proto.proto
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// Protocol Message for 3rd-party language binding.
+//
+// Paddle Python package will use `OpProto` to generate op creation methods.
+// The op creation methods take the user's input, generate an `OpDesc` proto
+// message, then pass `OpDesc` to the C++ side to create the Op pointer.
+//
+syntax="proto2";
+package paddle.framework;
+
+import "attr_type.proto";
+
+// Attribute protocol message for 3rd-party language binding.
+// It stores which attributes an Op supports and the type of each attribute.
+message AttrProto {
+    // Supported attribute name. e.g. `scale` for cosine op.
+    required string name = 1;
+
+    // Supported attribute type.
+    required AttrType type = 2;
+
+    // Supported attribute comments. It helps 3rd-party languages generate doc-strings.
+    required string comment = 3;
+}
+
+// Input or output message for 3rd-party language binding.
+// It contains the parameter name and its comment.
+message VarProto {
+    // Input or output name in that op creation function.
+    // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
+    required string name = 1;
+
+    // The comment for that input. It helps 3rd-party languages generate doc-strings.
+    required string comment = 2;
+}
+
+// Op protocol message for 3rd-party language binding.
+// It contains all information for generating op creation method.
+message OpProto {
+    // The input information to generate op creation method.
+    repeated VarProto inputs = 1;
+
+    // The output information to generate op creation method.
+    repeated VarProto outputs = 2;
+
+    // The attribute information to generate op creation method.
+    repeated AttrProto attrs = 3;
+
+    // The comments for that Op. It helps 3rd-party languages generate
+    // doc-strings. The whole documentation of that Op is generated by comment,
+    // inputs, outputs, attrs together.
+    required string comment = 4;
+
+    // The type of that Op.
+    required string type = 5;
+}
From c9cd5b6e9dd9c92ae236709c61e3cde7a17ee2b9 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Mon, 3 Jul 2017 16:39:36 +0800
Subject: [PATCH 17/79] Update Authors.md

---
 AUTHORS.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/AUTHORS.md b/AUTHORS.md
index 08eaab10ea..09698ac140 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -18,6 +18,8 @@
 | wanghaoshuang | Haoshuang Wang |
 | wangzhen-nlp | Zhen Wang |
 | wwhu | Weiwei Hu |
+| xinghai-sun | XingHai Sun |
+| zhaopu7 | Pu Zhao |
 | reyoung | Yang Yu |
 | gangliao | Gang Liao |
 | luotao01 | Tao Luo |
From 9bf98168281952efee1ed5fd1a61b743b0847834 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Mon, 3 Jul 2017 16:47:11 +0800
Subject: [PATCH 18/79] Add OpProto unittest.

---
 paddle/framework/CMakeLists.txt   |  4 +++-
 paddle/framework/op_proto_test.cc | 31 +++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 paddle/framework/op_proto_test.cc

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 3284015908..50107faaed 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -5,4 +5,6 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_test(variable_test SRCS variable_test.cc)
 cc_test(scope_test SRCS scope_test.cc)
 cc_test(enforce_test SRCS enforce_test.cc)
-proto_library(op_proto SRCS op_proto.proto attr_type.proto)
+proto_library(attr_type SRCS attr_type.proto)
+proto_library(op_proto SRCS op_proto.proto)
+cc_test(op_proto_test SRCS op_proto_test.cc DEPS attr_type op_proto protobuf)
diff --git a/paddle/framework/op_proto_test.cc b/paddle/framework/op_proto_test.cc
new file mode 100644
index 0000000000..9c054bde44
--- /dev/null
+++ b/paddle/framework/op_proto_test.cc
@@ -0,0 +1,31 @@
+#include <gtest/gtest.h>
+#include <paddle/framework/op_proto.pb.h>
+
+TEST(TestOpProto, ALL) {
+  paddle::framework::OpProto proto;
+  {
+    auto ipt = proto.mutable_inputs()->Add();
+    *ipt->mutable_name() = "a";
+    *ipt->mutable_comment() = "the one input of cosine op";
+  }
+  {
+    auto ipt = proto.mutable_inputs()->Add();
+    *ipt->mutable_name() = "b";
+    *ipt->mutable_comment() = "the other input of cosine op";
+  }
+  {
+    auto opt = proto.mutable_outputs()->Add();
+    *opt->mutable_name() = "output";
+    *opt->mutable_comment() = "the output of cosine op";
+  }
+  {
+    auto attr = proto.mutable_attrs()->Add();
+    *attr->mutable_name() = "scale";
+    attr->set_type(paddle::framework::AttrType::FLOAT);
+    *attr->mutable_comment() = "the scale attribute of cosine op";
+  }
+  proto.set_type("cos");
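+  // `comment`, together with inputs, outputs and attrs, forms the whole
+  // documentation of this op (see op_proto.proto).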
+  *proto.mutable_comment() = "cosine op, output = scale * cos(a, b)";
+
+  ASSERT_TRUE(proto.IsInitialized());
+}
\ No newline at end of file
From 0e61730039b11861d5a90188987bad2241a08f95 Mon Sep 17 00:00:00 2001
From: Luo Tao
Date: Mon, 3 Jul 2017 12:05:38 +0800
Subject: [PATCH 19/79] stride pooling for max and average layer

---
 paddle/gserver/layers/MaxLayer.h                   |  5 ++
 .../layers/SequenceLastInstanceLayer.cpp           |  3 +-
 paddle/gserver/layers/SequencePoolLayer.cpp        |  5 +-
 paddle/gserver/layers/SequencePoolLayer.h          |  2 -
 paddle/gserver/tests/test_LayerGrad.cpp            | 12 ++++-
 paddle/parameter/Argument.cpp                      |  6 +--
 paddle/parameter/Argument.h                        |  2 +-
 paddle/parameter/tests/test_argument.cpp           |  4 +-
 python/paddle/trainer/config_parser.py             |  8 +++
 .../paddle/trainer_config_helpers/layers.py        | 12 +++++
 .../protostr/test_sequence_pooling.protostr        | 51 +++++++++++++++++++
 .../tests/configs/test_sequence_pooling.py         |  8 +++
 12 files changed, 103 insertions(+), 15 deletions(-)

diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h
index baa58ca2d7..adf7ab4ae4 100644
--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -26,6 +26,11 @@ namespace paddle {
  * If SequenceLevel = kNonSeq:
  *   Output: output size is the number of input sequences (NOT input instances)
  *   output[i] = max_{for each instance in this sequence}{input[i]}
+ * If stride_ > 0:
+ *   Output: a shortened sequence. The operation of getting the max instance
+ *           of a sequence is independently performed on every slice of the
+ *           input sequence, which is obtained by sliding a window with the
+ *           window size set to stride_.
  * If SequenceLevel = kSeq:
  *   Check input sequence must has sub-sequence
  *   Output: output size is the number of input sub-sequences
diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
index 944c705166..8127cbf09c 100644
--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -73,8 +73,7 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
 void SequenceLastInstanceLayer::forward(PassType passType) {
   SequencePoolLayer::forward(passType);
 
-  auto starts = (stride_ > 0) ? stridePositions_->getData()
-                              : startPositions_->getData(false);
+  auto starts = startPositions_->getData(false);
   MatrixPtr inputValue = getInputValue(0);
   MatrixPtr outputValue = getOutputValue();
 
diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp
index 4179a9e7e0..2a693b110a 100644
--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@@ -72,9 +72,8 @@ void SequencePoolLayer::forward(PassType passType) {
   if (stride_ > 0) {
     CHECK_EQ(input.hasSubseq(), 0UL)
         << "sequence stride pooling is invalid for hasSubseq now";
-    output_.poolSequenceWithStride(
-        input, stride_, &stridePositions_, reversed_);
-    newBatchSize_ = stridePositions_->getSize() - 1;
+    output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_);
+    newBatchSize_ = startPositions_->getSize() - 1;
   }
 
   resetOutput(newBatchSize_, dim);
diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h
index 293d1bf278..058627def8 100644
--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
@@ -47,8 +47,6 @@ protected:
   size_t newBatchSize_;
   ICpuGpuVectorPtr startPositions_;
   int stride_;
-  // Store the start position of each window.
-  IVectorPtr stridePositions_;
   // Whether the input sequence is reversed or not.
   bool reversed_ = false;
 
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 297756025b..ed067e7c3a 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -845,8 +845,12 @@ void testDegradeLayer(bool hasSubseq,
 
 TEST(Layer, MaxLayer) {
   testDegradeLayer(false, "max", "non-seq", -1);  // seq max to non-seq
-  testDegradeLayer(true, "max", "non-seq", -1);   // hasSubseq max to non-seq
-  testDegradeLayer(true, "max", "seq", -1);       // hasSubseq max to seq
+  testDegradeLayer(false,
+                   "max",
+                   "non-seq",
+                   5);  // seq max to a shortened seq, stride window = 5
+  testDegradeLayer(true, "max", "non-seq", -1);  // hasSubseq max to non-seq
+  testDegradeLayer(true, "max", "seq", -1);      // hasSubseq max to seq
 }
 
 TEST(Layer, SequenceLastInstanceLayer) {
@@ -868,6 +872,10 @@ TEST(Layer, SequenceLastInstanceLayer) {
 
 TEST(Layer, AverageLayer) {
   testDegradeLayer(false, "average", "non-seq", -1);  // seq average to non-seq
+  testDegradeLayer(false,
+                   "average",
+                   "non-seq",
+                   5);  // seq average to a shortened seq, stride window = 5
   testDegradeLayer(
       true, "average", "non-seq", -1);  // hasSubseq average to non-seq
   testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 5beced3bb5..ef72b973c1 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -561,7 +561,7 @@ void Argument::degradeSequence(const Argument& input) {
 
 void Argument::poolSequenceWithStride(const Argument& input,
                                       size_t stride,
-                                      IVectorPtr* stridePostions,
+                                      ICpuGpuVectorPtr* stridePositions,
                                       bool reversed) {
   // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
   // then sequenceStartPositions = [0, 2, 3, 4, 7].
@@ -598,8 +598,8 @@ void Argument::poolSequenceWithStride(const Argument& input,
   stridePos.emplace_back(starts[numSequences]);
   int size = stridePos.size();
   CHECK_EQ(size - 1, tgtBuf[numSequences]);
-  IVector::resizeOrCreate(*stridePostions, size, false);
-  (*stridePostions)->copyFrom(stridePos.data(), size);
+  ICpuGpuVector::resizeOrCreate(*stridePositions, size, false);
+  (*stridePositions)->getMutableVector(false)->copyFrom(stridePos.data(), size);
 }
 
 void Argument::getValueString(
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index 09bd633616..0ccdef802e 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -299,7 +299,7 @@ struct Argument {
    */
   void poolSequenceWithStride(const Argument& input,
                               size_t stride,
-                              IVectorPtr* stridePositions,
+                              ICpuGpuVectorPtr* stridePositions,
                               bool reversed = false);
   /**
    * @brief getValueString will return the argument's output in string. There
diff --git a/paddle/parameter/tests/test_argument.cpp b/paddle/parameter/tests/test_argument.cpp
index 98ab013548..19df6ea957 100644
--- a/paddle/parameter/tests/test_argument.cpp
+++ b/paddle/parameter/tests/test_argument.cpp
@@ -31,7 +31,7 @@ TEST(Argument, poolSequenceWithStride) {
   int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30};
 
   for (auto reversed : {false, true}) {
-    IVectorPtr stridePositions;
+    ICpuGpuVectorPtr stridePositions;
     output.poolSequenceWithStride(
         input, 5 /* stride */, &stridePositions, reversed);
 
@@ -45,7 +45,7 @@ TEST(Argument, poolSequenceWithStride) {
     CHECK_EQ(stridePositions->getSize(), 8UL);
     auto result = reversed ?
strideResultReversed : strideResult;
     for (int i = 0; i < 8; i++) {
-      CHECK_EQ(stridePositions->getData()[i], result[i]);
+      CHECK_EQ(stridePositions->getData(false)[i], result[i]);
     }
   }
 }
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index b7418101d8..5ca7df7476 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2420,10 +2420,14 @@ class MaxLayer(LayerBase):
                  trans_type='non-seq',
                  bias=False,
                  output_max_index=None,
+                 stride=-1,
                  **xargs):
         super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs)
         config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
+        if trans_type == 'seq':
+            config_assert(stride == -1, 'subseq does not support stride window')
         self.config.trans_type = trans_type
+        self.config.seq_pool_stride = stride
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             self.set_layer_size(input_layer.size)
@@ -2685,11 +2689,15 @@ class AverageLayer(LayerBase):
                  average_strategy='average',
                  trans_type='non-seq',
                  bias=False,
+                 stride=-1,
                  **xargs):
         super(AverageLayer, self).__init__(
             name, 'average', 0, inputs=inputs, **xargs)
         self.config.average_strategy = average_strategy
+        if trans_type == 'seq':
+            config_assert(stride == -1, 'subseq does not support stride window')
         self.config.trans_type = trans_type
+        self.config.seq_pool_stride = stride
         config_assert(len(inputs) == 1, 'AverageLayer must have 1 input')
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index a601d5c84a..5e8bf4b203 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1090,10 +1090,16 @@ def pooling_layer(input,
                   name=None,
                   bias_attr=None,
                   agg_level=AggregateLevel.TO_NO_SEQUENCE,
+                  stride=-1,
                   layer_attr=None):
     """
     Pooling layer for sequence inputs, not used for Image.
 
+    If stride > 0, this layer slides a window whose size is determined by
+    stride, and returns the pooling value of the window as the output. Thus,
+    a long sequence will be shortened. Note that for a sequence with
+    sub-sequences, the default value of stride is -1.
+
     The example usage is:
 
     .. code-block:: python
@@ -1112,6 +1118,8 @@ def pooling_layer(input,
     :param pooling_type: Type of pooling, MaxPooling(default), AvgPooling,
                          SumPooling, SquareRootNPooling.
     :type pooling_type: BasePoolingType|None
+    :param stride: window size.
+    :type stride: int
     :param bias_attr: Bias parameter attribute. False if no bias.
     :type bias_attr: ParameterAttribute|None|False
     :param layer_attr: The Extra Attributes for layer, such as dropout.
@@ -1129,12 +1137,16 @@ def pooling_layer(input, extra_dict['output_max_index'] = pooling_type.output_max_index extra_dict.update(ExtraLayerAttribute.to_kwargs(layer_attr)) + if agg_level == AggregateLevel.TO_SEQUENCE: + assert stride == -1 + Layer( name=name, type=pooling_type.name, inputs=[Input(input.name)], bias=ParamAttr.to_bias(bias_attr), trans_type=agg_level, + stride=stride, **extra_dict) return LayerOutput( diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr index 5a217f5544..8989561df0 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr @@ -14,6 +14,7 @@ layers { input_layer_name: "dat_in" } trans_type: "seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_1__" @@ -24,6 +25,7 @@ layers { input_layer_name: "dat_in" } trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_2__" @@ -35,6 +37,7 @@ layers { } average_strategy: "average" trans_type: "seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_3__" @@ -46,6 +49,7 @@ layers { } average_strategy: "average" trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_4__" @@ -57,6 +61,7 @@ layers { } average_strategy: "sum" trans_type: "seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_5__" @@ -68,6 +73,7 @@ layers { } average_strategy: "sum" trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_6__" @@ -77,8 +83,44 @@ layers { inputs { input_layer_name: "dat_in" } + trans_type: "non-seq" + seq_pool_stride: 5 +} +layers { + name: "__seq_pooling_7__" + type: "average" + size: 100 + active_type: "" + inputs { + input_layer_name: "dat_in" + } + average_strategy: "average" + trans_type: "non-seq" + seq_pool_stride: 5 +} +layers { + name: "__seq_pooling_8__" + type: "average" + size: 100 + active_type: "" + inputs { + input_layer_name: "dat_in" + } + average_strategy: "sum" + trans_type: "non-seq" + seq_pool_stride: 5 +} +layers { + name: "__seq_pooling_9__" + type: "max" + size: 100 + active_type: "" + inputs { + input_layer_name: "dat_in" + } output_max_index: true trans_type: "non-seq" + seq_pool_stride: -1 } input_layer_names: "dat_in" output_layer_names: "__seq_pooling_0__" @@ -88,6 +130,9 @@ output_layer_names: "__seq_pooling_3__" output_layer_names: "__seq_pooling_4__" output_layer_names: "__seq_pooling_5__" output_layer_names: "__seq_pooling_6__" +output_layer_names: "__seq_pooling_7__" +output_layer_names: "__seq_pooling_8__" +output_layer_names: "__seq_pooling_9__" sub_models { name: "root" layer_names: "dat_in" @@ -98,6 +143,9 @@ sub_models { layer_names: "__seq_pooling_4__" layer_names: "__seq_pooling_5__" layer_names: "__seq_pooling_6__" + layer_names: "__seq_pooling_7__" + layer_names: "__seq_pooling_8__" + layer_names: "__seq_pooling_9__" input_layer_names: "dat_in" output_layer_names: "__seq_pooling_0__" output_layer_names: "__seq_pooling_1__" @@ -106,6 +154,9 @@ sub_models { output_layer_names: "__seq_pooling_4__" output_layer_names: "__seq_pooling_5__" output_layer_names: "__seq_pooling_6__" + output_layer_names: "__seq_pooling_7__" + output_layer_names: "__seq_pooling_8__" + output_layer_names: "__seq_pooling_9__" is_recurrent_layer_group: false } diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py 
b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py index 3c49eb56c1..3c205eabd8 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py @@ -14,6 +14,14 @@ for pt in POOL_TYPE: for al in AGG_LEVEL: opts.append(pooling_layer(input=din, agg_level=al, pooling_type=pt())) +for pt in POOL_TYPE: + opts.append( + pooling_layer( + input=din, + agg_level=AggregateLevel.TO_NO_SEQUENCE, + pooling_type=pt(), + stride=5)) + opts.append( pooling_layer( input=din, pooling_type=MaxPooling(output_max_index=True))) From e146fe836bc5178b497329dacddc7a1dc5063bcd Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 3 Jul 2017 17:22:58 +0800 Subject: [PATCH 20/79] fix compile errors and add assert test --- paddle/framework/tensor.h | 25 ++++++-- paddle/framework/tensor_test.cc | 100 ++++++++++++++++++-------------- 2 files changed, 76 insertions(+), 49 deletions(-) diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 7fa662fbb5..73eedd7375 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/framework/ddim.h" #include "paddle/framework/enforce.h" #include "paddle/memory/memory.h" -#include "paddle/platform/assert.h" #include "paddle/platform/place.h" namespace paddle { @@ -63,21 +62,35 @@ class Tensor { template struct PlaceholderImpl : public Placeholder { - PlaceholderImpl(paddle::platform::Place pl, size_t size) - : ptr_(paddle::memory::Alloc(pl, size), paddle::memory::Deleter(pl)), - place_(pl), + private: + class Deleter { + public: + Deleter(platform::Place place) : place_(place) {} + void operator()(T* ptr) { + paddle::memory::Free(place_, static_cast(ptr)); + } + + private: + paddle::platform::Place place_; + }; + + public: + PlaceholderImpl(paddle::platform::Place place, size_t size) + : ptr_(static_cast(paddle::memory::Alloc(place, size)), + Deleter(place)), + place_(place), size_(size) {} virtual void* Ptr() const { return static_cast(ptr_.get()); } virtual size_t Size() const { return size_; } virtual paddle::platform::Place Place() const { return place_; } - std::unique_ptr ptr_; + std::unique_ptr ptr_; paddle::platform::Place place_; // record the place of ptr_. size_t size_; // size of the memory block. }; - std::unique_ptr holder_; // holds the memory block if allocated. + std::shared_ptr holder_; // holds the memory block if allocated. 
 };
 
 } // namespace framework

diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index fa44b24b64..f76a31e921 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -13,12 +13,23 @@
 #include "paddle/framework/tensor.h"
 #include <gtest/gtest.h>
+#include <string>
 
-TEST(Tensor, Data) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
+TEST(Tensor, ASSERT) {
+  paddle::framework::Tensor cpu_tensor;
 
-  Tensor cpu_tensor;
+  bool caught = false;
+  try {
+    const double* p __attribute__((unused)) = cpu_tensor.data<double>();
+  } catch (paddle::framework::EnforceNotMet err) {
+    caught = true;
+    std::string msg = "Tensor::data must be called after Tensor::mutable_data";
+    const char* what = err.what();
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(what[i], msg[i]);
+    }
+  }
+  ASSERT_TRUE(caught);
 }
 
 /* mutable_data() is not tested at present
@@ -27,45 +38,48 @@ TEST(Tensor, MutableData) {
   using namespace paddle::framework;
   using namespace paddle::platform;
+  {
+    Tensor cpu_tensor;
+    float* p1 = nullptr;
+    float* p2 = nullptr;
+    // initialization
+    p1 = cpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
+    EXPECT_NE(p1, nullptr);
+    // set cpu_tensor a new dim with large size
+    // memory is supposed to be re-allocated
+    p2 = cpu_tensor.mutable_data<float>(make_ddim({3, 4}));
+    EXPECT_NE(p2, nullptr);
+    EXPECT_NE(p1, p2);
+    // set cpu_tensor a new dim with same size
+    // memory block is supposed to be unchanged
+    p1 = cpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
+    EXPECT_EQ(p1, p2);
+    // set cpu_tensor a new dim with smaller size
+    // memory block is supposed to be unchanged
+    p2 = cpu_tensor.mutable_data<float>(make_ddim({2, 2}));
+    EXPECT_EQ(p1, p2);
+  }
 
-  Tensor cpu_tensor;
-  float* p1 = nullptr;
-  float* p2 = nullptr;
-  // initialization
-  p1 = cpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
-  EXPECT_NE(p1, nullptr);
-  // set cpu_tensor a new dim with large size
-  // memory is supposed to be re-allocated
-  p2 = cpu_tensor.mutable_data<float>(make_ddim({3, 4}));
-  EXPECT_NE(p2, nullptr);
-  EXPECT_NE(p1, p2);
-  // set cpu_tensor a new dim with same size
-  // memory block is supposed to be unchanged
-  p1 = cpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
-  EXPECT_EQ(p1, p2);
-  // set cpu_tensor a new dim with smaller size
-  // memory block is supposed to be unchanged
-  p2 = cpu_tensor.mutable_data<float>(make_ddim({2, 2}));
-  EXPECT_EQ(p1, p2);
-
-  Tensor gpu_tensor;
-  float* p1 = nullptr;
-  float* p2 = nullptr;
-  // initialization
-  p1 = gpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
-  EXPECT_NE(p1, nullptr);
-  // set gpu_tensor a new dim with large size
-  // memory is supposed to be re-allocated
-  p2 = gpu_tensor.mutable_data<float>(make_ddim({3, 4}));
-  EXPECT_NE(p2, nullptr);
-  EXPECT_NE(p1, p2);
-  // set gpu_tensor a new dim with same size
-  // memory block is supposed to be unchanged
-  p1 = gpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
-  EXPECT_EQ(p1, p2);
-  // set gpu_tensor a new dim with smaller size
-  // memory block is supposed to be unchanged
-  p2 = gpu_tensor.mutable_data<float>(make_ddim({2, 2}));
-  EXPECT_EQ(p1, p2);
+  {
+    Tensor gpu_tensor;
+    float* p1 = nullptr;
+    float* p2 = nullptr;
+    // initialization
+    p1 = gpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
+    EXPECT_NE(p1, nullptr);
+    // set gpu_tensor a new dim with large size
+    // memory is supposed to be re-allocated
+    p2 = gpu_tensor.mutable_data<float>(make_ddim({3, 4}));
+    EXPECT_NE(p2, nullptr);
+    EXPECT_NE(p1, p2);
+    // set gpu_tensor a new dim with same size
+    // memory block is supposed to be unchanged
+    p1 = gpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
+    EXPECT_EQ(p1, p2);
+    // set gpu_tensor a new dim with smaller size
+    // memory block is supposed to be unchanged
+    p2 = gpu_tensor.mutable_data<float>(make_ddim({2, 2}));
+    EXPECT_EQ(p1, p2);
+  }
 }
 */
\ No newline at end of file

From d054a5eef806d76458f9155bf5a4ffb98ba474d3 Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Mon, 3 Jul 2017 19:08:27 +0800
Subject: [PATCH 21/79] re-submit

---
 paddle/framework/tensor.h       | 2 +-
 paddle/framework/tensor_test.cc | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 73eedd7375..f777661a1c 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -30,7 +30,7 @@ class Tensor {
   template <typename T>
   const T* data() const {
     PADDLE_ENFORCE(holder_ != nullptr,
-                   "Tensor::data must be called after Tensor::mutable_data");
+                   "Tensor::data must be called after Tensor::mutable_data.");
     return static_cast<const T*>(holder_->Ptr());
   }
 
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index f76a31e921..727d81f8d7 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -23,7 +23,7 @@ TEST(Tensor, ASSERT) {
     const double* p __attribute__((unused)) = cpu_tensor.data<double>();
   } catch (paddle::framework::EnforceNotMet err) {
     caught = true;
-    std::string msg = "Tensor::data must be called after Tensor::mutable_data";
+    std::string msg = "Tensor::data must be called after Tensor::mutable_data.";
     const char* what = err.what();
     for (size_t i = 0; i < msg.length(); ++i) {
       ASSERT_EQ(what[i], msg[i]);
@@ -82,4 +82,4 @@ TEST(Tensor, MutableData) {
     EXPECT_EQ(p1, p2);
   }
 }
-*/
\ No newline at end of file
+*/

From 2d1f95de873542ae591b4575e14539f26945b162 Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Mon, 3 Jul 2017 19:33:33 +0800
Subject: [PATCH 22/79] fix a compile error

---
 paddle/framework/tensor.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index f777661a1c..6a152f6a6d 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -37,8 +37,10 @@ class Tensor {
   template <typename T,
             typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
   T* mutable_data(DDim dims, paddle::platform::Place place) {
-    if (holder_ == nullptr || holder_->Place() != place ||
-        holder_->Size() < product(dims) * sizeof(T)) {
+    if (holder_ == nullptr ||
+        !(holder_->Place() ==
+          place) /* some versions of boost::variant don't have operator!= */
+        || holder_->Size() < product(dims) * sizeof(T)) {
       holder_.reset(new PlaceholderImpl<T>(place, product(dims) * sizeof(T)));
     }
     return static_cast<T*>(holder_->Ptr());

From e48e21da2b2522e4a9e1bca589d68eb02a419fb0 Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Mon, 3 Jul 2017 20:14:30 +0800
Subject: [PATCH 23/79] remove unnecessary include

---
 paddle/framework/tensor.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 6a152f6a6d..ce5d98b04e 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -16,7 +16,6 @@ limitations under the License.
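Taken together, patches 20-23 leave `mutable_data<T>` re-allocating only when one of three things holds: there is no holder yet, the requested place differs, or the existing block is too small. `std::enable_if<std::is_pod<T>::value>` rejects non-POD element types at compile time, and `!(a == b)` stands in for `operator!=`, which some boost::variant versions lack. A minimal sketch of just that guard — the `Place` stand-in and the `NeedsRealloc` helper are illustrative, not part of the patch:

```c++
#include <cstddef>
#include <type_traits>

// Illustrative stand-in for paddle::platform::Place.
struct Place {
  int device = -1;  // -1 means CPU
  bool operator==(const Place& o) const { return device == o.device; }
};

// Hypothetical helper mirroring the condition inside Tensor::mutable_data<T>;
// restricted to POD element types, exactly as in patch 22.
template <typename T,
          typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
bool NeedsRealloc(const void* holder, Place holder_place, size_t holder_size,
                  Place place, size_t numel) {
  return holder == nullptr ||
         // written as !(a == b): some boost::variant versions lack operator!=
         !(holder_place == place) || holder_size < numel * sizeof(T);
}
```

Under this rule, resizing a `{1, 2, 3}` float tensor to `{2, 2}` reuses the block (same place, smaller size), which is exactly what the commented-out `MutableData` test pins down with `EXPECT_EQ(p1, p2)`.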
*/ #include #include -#include #include "paddle/framework/ddim.h" #include "paddle/framework/enforce.h" #include "paddle/memory/memory.h" From 3ba7a738f3f3e77240d026db57692d66bc9481ed Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 3 Jul 2017 20:37:42 +0800 Subject: [PATCH 24/79] add dynamic_load --- paddle/platform/cublas.h | 87 +++++++++++++++++ paddle/platform/cudnn.h | 114 ++++++++++++++++++++++ paddle/platform/curand.h | 42 ++++++++ paddle/platform/dynamic_loader.cc | 157 ++++++++++++++++++++++++++++++ paddle/platform/dynamic_loader.h | 63 ++++++++++++ 5 files changed, 463 insertions(+) create mode 100644 paddle/platform/cublas.h create mode 100644 paddle/platform/cudnn.h create mode 100644 paddle/platform/curand.h create mode 100644 paddle/platform/dynamic_loader.cc create mode 100644 paddle/platform/dynamic_loader.h diff --git a/paddle/platform/cublas.h b/paddle/platform/cublas.h new file mode 100644 index 0000000000..70c9713325 --- /dev/null +++ b/paddle/platform/cublas.h @@ -0,0 +1,87 @@ +#include +#include "paddle/platform/dynamic_loader.h" + +namespace paddle { +namespace dyload { +namespace dynload { + +std::once_flag cublas_dso_flag; +void *cublas_dso_handle = nullptr; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cublas routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#ifdef PADDLE_USE_DSO +#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cublasStatus_t operator()(Args... args) { \ + typedef cublasStatus_t (*cublasFunc)(Args...); \ + std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \ + void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + } __name; // struct DynLoad__##__name +#else +#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cublasStatus_t operator()(Args... 
args) { \ + return __name(args...); \ + } \ + } __name; // struct DynLoad__##__name +#endif + +#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name) + +// include all needed cublas functions in HPPL +// clang-format off +#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasSgemv) \ + __macro(cublasDgemv) \ + __macro(cublasSgemm) \ + __macro(cublasDgemm) \ + __macro(cublasSgeam) \ + __macro(cublasDgeam) \ + +DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate) +DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy) +DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream) +DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode) +DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched) +CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) + +#undef DYNAMIC_LOAD_CUBLAS_WRAP +#undef DYNAMIC_LOAD_CUBLAS_V2_WRAP +#undef CUBLAS_BLAS_ROUTINE_EACH + +} /* namespace dynload */ + +// clang-format on +#ifndef PADDLE_TYPE_DOUBLE +#define CUBLAS_GEAM dynload::cublasSgeam +#define CUBLAS_GEMV dynload::cublasSgemv +#define CUBLAS_GEMM dynload::cublasSgemm +#define CUBLAS_GETRF dynload::cublasSgetrfBatched +#define CUBLAS_GETRI dynload::cublasSgetriBatched +#else +#define CUBLAS_GEAM dynload::cublasDgeam +#define CUBLAS_GEMV dynload::cublasDgemv +#define CUBLAS_GEMM dynload::cublasDgemm +#define CUBLAS_GETRF dynload::cublasDgetrfBatched +#define CUBLAS_GETRI dynload::cublasDgetriBatched +#endif +} // namespace dyload +} // namespace paddle diff --git a/paddle/platform/cudnn.h b/paddle/platform/cudnn.h new file mode 100644 index 0000000000..ab878cd555 --- /dev/null +++ b/paddle/platform/cudnn.h @@ -0,0 +1,114 @@ +#include +#include "paddle/platform/dynamic_loader.h" + +namespace paddle { +namespace dyload { + +std::once_flag cudnn_dso_flag; +void* cudnn_dso_handle = nullptr; + +#ifdef PADDLE_USE_DSO + +#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using cudnn_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \ + void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + } __name; /* struct DynLoad__##__name */ + +#else + +#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ + } __name; /* struct DynLoad__##__name */ + +#endif + +/** + * include all needed cudnn functions in HPPL + * different cudnn version has different interfaces + **/ +// clang-format off +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor) \ + __macro(cudnnSetTensor4dDescriptorEx) \ + __macro(cudnnGetConvolutionNdForwardOutputDim) \ + __macro(cudnnGetConvolutionForwardAlgorithm) \ + __macro(cudnnCreateTensorDescriptor) \ + __macro(cudnnDestroyTensorDescriptor) \ + __macro(cudnnCreateFilterDescriptor) \ + __macro(cudnnSetFilter4dDescriptor) \ + __macro(cudnnSetPooling2dDescriptor) \ + __macro(cudnnDestroyFilterDescriptor) \ + __macro(cudnnCreateConvolutionDescriptor) \ + __macro(cudnnCreatePoolingDescriptor) \ + __macro(cudnnDestroyPoolingDescriptor) \ + __macro(cudnnSetConvolution2dDescriptor) \ + __macro(cudnnDestroyConvolutionDescriptor) \ + __macro(cudnnCreate) \ + __macro(cudnnDestroy) \ + __macro(cudnnSetStream) \ + __macro(cudnnActivationForward) \ + __macro(cudnnConvolutionForward) \ + __macro(cudnnConvolutionBackwardBias) \ + __macro(cudnnGetConvolutionForwardWorkspaceSize) \ + __macro(cudnnTransformTensor) \ + __macro(cudnnPoolingForward) \ + __macro(cudnnPoolingBackward) \ + __macro(cudnnSoftmaxBackward) \ + __macro(cudnnSoftmaxForward) \ + __macro(cudnnGetVersion) \ + __macro(cudnnGetErrorString) +CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP) + +#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ + __macro(cudnnAddTensor) \ + __macro(cudnnConvolutionBackwardData) \ + __macro(cudnnConvolutionBackwardFilter) +CUDNN_DNN_ROUTINE_EACH_R2(DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs available after R3: +#if CUDNN_VERSION >= 3000 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ + __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize) \ + __macro(cudnnGetConvolutionBackwardDataAlgorithm) \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithm) \ + __macro(cudnnGetConvolutionBackwardDataWorkspaceSize) +CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP) +#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3 +#endif + + +// APIs available after R4: +#if CUDNN_VERSION >= 4007 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ + __macro(cudnnBatchNormalizationForwardTraining) \ + __macro(cudnnBatchNormalizationForwardInference) \ + __macro(cudnnBatchNormalizationBackward) +CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DYNAMIC_LOAD_CUDNN_WRAP) +#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 +#endif + +// APIs in R5 +#if CUDNN_VERSION >= 5000 +#define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \ + __macro(cudnnCreateActivationDescriptor) \ + __macro(cudnnSetActivationDescriptor) \ + __macro(cudnnGetActivationDescriptor) \ + __macro(cudnnDestroyActivationDescriptor) +CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP) +#undef CUDNN_DNN_ROUTINE_EACH_R5 +#endif + +#undef CUDNN_DNN_ROUTINE_EACH +// clang-format on +} // namespace dyload +} // namespace paddle diff --git a/paddle/platform/curand.h b/paddle/platform/curand.h new file mode 100644 index 0000000000..692c024e6e --- /dev/null +++ b/paddle/platform/curand.h @@ -0,0 +1,42 @@ +#include +#include "paddle/platform/dynamic_loader.h" + +namespace paddle { +namespace dyload { +#ifdef PADDLE_USE_DSO +#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... 
args) { \
+      typedef curandStatus_t (*curandFunc)(Args...); \
+      std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \
+      void *p_##__name = dlsym(curand_dso_handle, #__name); \
+      return reinterpret_cast<curandFunc>(p_##__name)(args...); \
+    } \
+  } __name; /* struct DynLoad__##__name */
+#else
+#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
+  struct DynLoad__##__name { \
+    template <typename... Args> \
+    curandStatus_t operator()(Args... args) { \
+      return __name(args...); \
+    } \
+  } __name; /* struct DynLoad__##__name */
+#endif
+
+/* include all needed curand functions in HPPL */
+// clang-format off
+#define CURAND_RAND_ROUTINE_EACH(__macro) \
+  __macro(curandCreateGenerator) \
+  __macro(curandSetStream) \
+  __macro(curandSetPseudoRandomGeneratorSeed)\
+  __macro(curandGenerateUniform) \
+  __macro(curandGenerateUniformDouble)
+// clang-format on
+
+CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
+
+#undef CURAND_RAND_ROUTINE_EACH
+#undef DYNAMIC_LOAD_CURAND_WRAP
+}
+} // namespace paddle

diff --git a/paddle/platform/dynamic_loader.cc b/paddle/platform/dynamic_loader.cc
new file mode 100644
index 0000000000..9036eaf642
--- /dev/null
+++ b/paddle/platform/dynamic_loader.cc
@@ -0,0 +1,157 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <dlfcn.h>
+#include "DynamicLoader.h"
+#include "Logging.h"
+
+DEFINE_string(cudnn_dir, "",
+              "Specify path for loading libcudnn.so. For instance, "
+              "/usr/local/cudnn/lib. If empty [default], dlopen "
+              "will search cudnn from LD_LIBRARY_PATH");
+
+DEFINE_string(cuda_dir, "",
+              "Specify path for loading cuda library, such as libcublas, "
+              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
+              "dlopen will search cuda from LD_LIBRARY_PATH");
+
+DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
+
+DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
+
+static inline std::string join(const std::string& part1,
+                               const std::string& part2) {
+  // directory separator
+  const char sep = '/';
+  if (!part2.empty() && part2.front() == sep) {
+    return part2;
+  }
+  std::string ret;
+  ret.reserve(part1.size() + part2.size() + 1);
+  ret = part1;
+  if (!ret.empty() && ret.back() != sep) {
+    ret += sep;
+  }
+  ret += part2;
+  return ret;
+}
+
+static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
+                                               void** dso_handle,
+                                               int dynload_flags) {
+  VLOG(3) << "Try to find library: " << dso_path
+          << " from default system path.";
+  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+  *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+
+// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
+// bring in System Integrity Protection (SIP); if dso_handle
+// is null, search from the default package path on Mac OS.
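// The overall lookup that GetDsoHandleFromDefaultPath and the
// GetDsoHandleFromSearchPath helper below implement, compressed into one
// illustrative sketch (logging and the final CHECK omitted):
//
//   void* OpenDso(const std::string& search_root, const std::string& name) {
//     const int flags = RTLD_LAZY | RTLD_LOCAL;
//     if (!search_root.empty()) {
//       std::string full = join(search_root, name);
//       if (void* h = dlopen(full.c_str(), flags)) return h;  // 1. --xxx_dir
//     }
//     if (void* h = dlopen(name.c_str(), flags)) return h;  // 2. LD_LIBRARY_PATH
//   #if defined(__APPLE__) || defined(__OSX__)
//     std::string mac = join("/usr/local/cuda/lib/", name);  // 3. SIP fallback
//     if (void* h = dlopen(mac.c_str(), flags)) return h;
//   #endif
//     return nullptr;  // the caller reports a fatal error with dlerror()
//   }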
+#if defined(__APPLE__) || defined(__OSX__) + if (nullptr == *dso_handle) { + dso_path = join("/usr/local/cuda/lib/", dso_path); + *dso_handle = dlopen(dso_path.c_str(), dynload_flags); + if (nullptr == *dso_handle) { + if (dso_path == "libcudnn.dylib") { + LOG(FATAL) + << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" // NOLINT + << "For instance, sudo tar -xzf " + "cudnn-7.5-osx-x64-v5.0-ga.tgz -C " // NOLINT + << "/usr/local \n sudo chmod a+r " + "/usr/local/cuda/include/cudnn.h " // NOLINT + << "/usr/local/cuda/lib/libcudnn*"; + } + } + } +#endif +} + +static inline void GetDsoHandleFromSearchPath(const std::string& search_root, + const std::string& dso_name, + void** dso_handle) { + int dynload_flags = RTLD_LAZY | RTLD_LOCAL; + *dso_handle = nullptr; + + std::string dlPath = dso_name; + if (search_root.empty()) { + GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); + } else { + // search xxx.so from custom path + dlPath = join(search_root, dso_name); + *dso_handle = dlopen(dlPath.c_str(), dynload_flags); + // if not found, search from default path + if (nullptr == *dso_handle) { + LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" + << dlerror() << ")"; + dlPath = dso_name; + GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); + } + } + + CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath + << " (" << dlerror() << ") \n" + << "Please specify its path correctly using " + "following ways: \n" + + << "Method. set environment variable " + "LD_LIBRARY_PATH on Linux or " + << "DYLD_LIBRARY_PATH on Mac OS. \n" + << "For instance, issue command: export " + "LD_LIBRARY_PATH=... \n" + + << "Note: After Mac OS 10.11, using the " + "DYLD_LIBRARY_PATH is impossible " + << "unless System Integrity Protection (SIP) " + "is disabled."; +} + +void GetCublasDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle); +#endif +} + +void GetCudnnDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle); +#endif +} + +void GetCurandDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); +#endif +} + +void GetWarpCTCDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle); +#endif +} + +void GetLapackDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle); +#endif +} diff --git a/paddle/platform/dynamic_loader.h b/paddle/platform/dynamic_loader.h new file mode 100644 index 0000000000..9b5ad21724 --- /dev/null +++ b/paddle/platform/dynamic_loader.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef DYNAMIC_LOAD_H_ +#define DYNAMIC_LOAD_H_ + +#include +#include +#include +#include + +/** + * @brief load the DSO of CUBLAS + * + * @param **dso_handle dso handler + * + */ +void GetCublasDsoHandle(void** dso_handle); + +/** + * @brief load the DSO of CUDNN + * + * @param **dso_handle dso handler + * + */ +void GetCudnnDsoHandle(void** dso_handle); + +/** + * @brief load the DSO of CURAND + * + * @param **dso_handle dso handler + * + */ +void GetCurandDsoHandle(void** dso_handle); + +/** + * @brief load the DSO of warp-ctc + * + * @param **dso_handle dso handler + * + */ +void GetWarpCTCDsoHandle(void** dso_handle); + +/** + * @brief load the DSO of lapack + * + * @param **dso_handle dso handler + * + */ +void GetLapackDsoHandle(void** dso_handle); + +#endif // DYNAMIC_LOAD_H_ From a30754b05e1ef58b5803c3d9996ed0cc69100ac5 Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 3 Jul 2017 20:41:31 +0800 Subject: [PATCH 25/79] test device_context --- paddle/platform/CMakeLists.txt | 3 + paddle/platform/device_context.h | 166 +++++++++++++++++++++++++ paddle/platform/device_context_test.cu | 29 +++++ 3 files changed, 198 insertions(+) create mode 100644 paddle/platform/device_context.h create mode 100644 paddle/platform/device_context_test.cu diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index c7d7b14518..c95b54a4df 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -2,3 +2,6 @@ nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) + +cc_library(dynamic_loader SRCS dynamic_loader.cc) +nv_test(device_context_test SRCS device_context_test.cu DEPS place dynamic_loader glog gflags) diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h new file mode 100644 index 0000000000..f95aac4a36 --- /dev/null +++ b/paddle/platform/device_context.h @@ -0,0 +1,166 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
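These entry points make by-hand symbol resolution a two-step affair, which is what the `DYNAMIC_LOAD_*_WRAP` structs automate. A hedged sketch — the explicit `cublasSgemm_v2` lookup is illustrative only; real callers go through the generated wrappers:

```c++
#include <dlfcn.h>

void GetCublasDsoHandle(void** dso_handle);  // from dynamic_loader.h

void ResolveByHand() {
  void* cublas_dso = nullptr;
  // Honors the --cuda_dir flag first, then falls back to LD_LIBRARY_PATH.
  GetCublasDsoHandle(&cublas_dso);
  // dlsym yields the raw entry point; a real caller would cast it to the
  // matching cublas function-pointer type before invoking it.
  void* sym = dlsym(cublas_dso, "cublasSgemm_v2");
  (void)sym;
}
```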
*/ + +#pragma once + +#ifndef PADDLE_ONLY_CPU +#include "paddle/platform/cublas.h" +#include "paddle/platform/cuda.h" +#include "paddle/platform/cudnn.h" +#include "paddle/platform/curand.h" +#define EIGEN_USE_GPU +#endif + +#include "paddle/framework/enforce.h" +#include "paddle/platform/place.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace platform { + +class DeviceContext { + public: + virtual ~DeviceContext() {} +}; + +class CpuDeviceContext : public DeviceContext { + Eigen::DefaultDevice eigen_device() { + if (!eigen_device_) { + eigen_device_ = new Eigen::DefaultDevice(); + } + return *eigen_device_; + } + + private: + Eigen::DefaultDevice* eigen_device_{nullptr}; +}; + +#ifndef PADDLE_ONLY_CPU +class DeviceGuard { + public: + explicit DeviceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) { + if (previous_ != new_place) { + paddle::platform::SetDeviceId(new_place.device); + } + } + + ~DeviceGuard() { paddle::platform::SetDeviceId(previous_.device); } + + private: + GPUPlace previous_; +}; + +class CudaDeviceContext : public DeviceContext { + public: + explicit CudaDeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { + DeviceGuard guard(gpu_place_); + paddle::platform::throw_on_error(cudaStreamCreate(&stream_), + "cudaStreamCreate failed"); + eigen_stream_ = new Eigen::CudaStreamDevice(&stream_); + eigen_device_ = new Eigen::GpuDevice(eigen_stream_); + } + + void Wait() { + paddle::platform::throw_on_error(cudaStreamSynchronize(stream_), + "cudaStreamSynchronize failed"); + } + + cudaStream_t stream() { return stream_; } + + Eigen::GpuDevice eigen_device() { return *eigen_device_; } + + cublasHandle_t cublas_handle() { + if (!blas_handle_) { + DeviceGuard guard(gpu_place_); + PADDLE_ENFORCE(cublasCreate(&blas_handle_) == CUBLAS_STATUS_SUCCESS, + "cublasCreate failed"); + PADDLE_ENFORCE( + cublasSetStream(blas_handle_, stream_) == CUBLAS_STATUS_SUCCESS, + "cublasSetStream failed"); + } + return blas_handle_; + } + + cudnnHandle_t cudnn_handle() { + if (!dnn_handle_) { + DeviceGuard guard(gpu_place_); + PADDLE_ENFORCE(cudnnCreate(&dnn_handle_) == CUDNN_STATUS_SUCCESS, + "cudnnCreate failed"); + PADDLE_ENFORCE( + cudnnSetStream(dnn_handle_, stream_) == CUDNN_STATUS_SUCCESS, + "cudnnSetStream failed"); + } + return dnn_handle_; + } + + curandGenerator_t curand_generator() { + if (!rand_generator_) { + DeviceGuard guard(gpu_place_); + PADDLE_ENFORCE( + curandCreateGenerator(&rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) == + CURAND_STATUS_SUCCESS, + "curandCreateGenerator failed"); + PADDLE_ENFORCE( + curandSetPseudoRandomGeneratorSeed(rand_generator_, random_seed_) == + CURAND_STATUS_SUCCESS, + "curandSetPseudoRandomGeneratorSeed failed"); + PADDLE_ENFORCE( + curandSetStream(rand_generator_, stream_) == CURAND_STATUS_SUCCESS, + "curandSetStream failed"); + } + return rand_generator_; + } + + ~CudaDeviceContext() { + Wait(); + if (blas_handle_) { + PADDLE_ENFORCE(cublasDestroy(blas_handle_) == CUBLAS_STATUS_SUCCESS, + "cublasDestroy failed"); + } + + if (dnn_handle_) { + PADDLE_ENFORCE(cudnnDestroy(dnn_handle_) == CUDNN_STATUS_SUCCESS, + "cudnnDestroy failed"); + } + + if (rand_generator_) { + PADDLE_ENFORCE( + curandDestroyGenerator(rand_generator_) == CURAND_STATUS_SUCCESS, + "curandDestroyGenerator failed"); + } + + delete eigen_stream_; + delete eigen_device_; + + paddle::platform::throw_on_error(cudaStreamDestroy(stream_), + "cudaStreamDestroy failed"); + } + + private: + GPUPlace gpu_place_; + cudaStream_t stream_; + + 
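// DeviceGuard above is a save-and-restore RAII wrapper: it snapshots the
// current device, switches if needed, and restores on scope exit, so each
// lazy handle getter can temporarily bind to gpu_place_ without disturbing
// its caller. The same shape reduced to its core, using the
// GetCurrentDeviceId/SetDeviceId helpers this patch series adds to cuda.h
// (sketch, not the actual class):
//
//   class ScopedDevice {
//    public:
//     explicit ScopedDevice(int new_device)
//         : previous_(paddle::platform::GetCurrentDeviceId()) {
//       if (previous_ != new_device) paddle::platform::SetDeviceId(new_device);
//     }
//     ~ScopedDevice() { paddle::platform::SetDeviceId(previous_); }
//
//    private:
//     int previous_;
//   };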
Eigen::CudaStreamDevice* eigen_stream_; + Eigen::GpuDevice* eigen_device_; + + cublasHandle_t blas_handle_{nullptr}; + + cudnnHandle_t dnn_handle_{nullptr}; + + int random_seed_; + curandGenerator_t rand_generator_{nullptr}; +}; +#endif +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/device_context_test.cu b/paddle/platform/device_context_test.cu new file mode 100644 index 0000000000..a15fb53b71 --- /dev/null +++ b/paddle/platform/device_context_test.cu @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/device_context.h" +#include "gtest/gtest.h" + + +TEST(DeviceContext, CudaDevice) { + int count = paddle::platform::GetDeviceCount(); + for (int i = 0; i < count; i++) { + paddle::platform::CudaDeviceContext* device_context = new paddle::platform::CudaDeviceContext(i); + __attribute__((unused)) Eigen::GpuDevice gpu_device = device_context->eigen_device(); + __attribute__((unused)) cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); + __attribute__((unused)) cublasHandle_t cublas_handle = device_context->cublas_handle(); + __attribute__((unused)) curandGenerator_t curand_handle = device_context->curand_generator(); + delete device_context; + } +} From a77fcef3f99724e85e2239ad91683b7afe913cd8 Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 3 Jul 2017 12:55:39 +0000 Subject: [PATCH 26/79] fix cuda compile error --- paddle/platform/cublas.h | 3 -- paddle/platform/cuda.h | 9 ++++++ paddle/platform/curand.h | 5 ++- paddle/platform/device_context.h | 52 +++++++++++++++++-------------- paddle/platform/dynamic_loader.cc | 4 +-- 5 files changed, 43 insertions(+), 30 deletions(-) diff --git a/paddle/platform/cublas.h b/paddle/platform/cublas.h index 70c9713325..d60eb501e9 100644 --- a/paddle/platform/cublas.h +++ b/paddle/platform/cublas.h @@ -3,7 +3,6 @@ namespace paddle { namespace dyload { -namespace dynload { std::once_flag cublas_dso_flag; void *cublas_dso_handle = nullptr; @@ -67,8 +66,6 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) #undef DYNAMIC_LOAD_CUBLAS_V2_WRAP #undef CUBLAS_BLAS_ROUTINE_EACH -} /* namespace dynload */ - // clang-format on #ifndef PADDLE_TYPE_DOUBLE #define CUBLAS_GEAM dynload::cublasSgeam diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h index 8fe891f9ce..05290b0e1e 100644 --- a/paddle/platform/cuda.h +++ b/paddle/platform/cuda.h @@ -33,6 +33,15 @@ int GetDeviceCount(void) { throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed"); return count; } +int GetCurrentDeviceId(void) { + int device_id; + throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed"); + return device_id; +} + +void SetDeviceId(int device_id) { + throw_on_error(cudaSetDevice(device_id), "cudaSetDevice failed"); +} } // namespace platform } // namespace paddle diff --git a/paddle/platform/curand.h b/paddle/platform/curand.h index 692c024e6e..edff6526bd 100644 --- a/paddle/platform/curand.h +++ b/paddle/platform/curand.h @@ -3,6 +3,8 @@ 
namespace paddle { namespace dyload { +std::once_flag curand_dso_flag; +void *curand_dso_handle = nullptr; #ifdef PADDLE_USE_DSO #define DYNAMIC_LOAD_CURAND_WRAP(__name) \ struct DynLoad__##__name { \ @@ -31,7 +33,8 @@ namespace dyload { __macro(curandSetStream) \ __macro(curandSetPseudoRandomGeneratorSeed)\ __macro(curandGenerateUniform) \ - __macro(curandGenerateUniformDouble) + __macro(curandGenerateUniformDouble) \ + __macro(curandDestroyGenerator) // clang-format on CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index f95aac4a36..65e76666a7 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -83,11 +83,12 @@ class CudaDeviceContext : public DeviceContext { cublasHandle_t cublas_handle() { if (!blas_handle_) { DeviceGuard guard(gpu_place_); - PADDLE_ENFORCE(cublasCreate(&blas_handle_) == CUBLAS_STATUS_SUCCESS, - "cublasCreate failed"); PADDLE_ENFORCE( - cublasSetStream(blas_handle_, stream_) == CUBLAS_STATUS_SUCCESS, - "cublasSetStream failed"); + paddle::dyload::cublasCreate(&blas_handle_) == CUBLAS_STATUS_SUCCESS, + "cublasCreate failed"); + PADDLE_ENFORCE(paddle::dyload::cublasSetStream(blas_handle_, stream_) == + CUBLAS_STATUS_SUCCESS, + "cublasSetStream failed"); } return blas_handle_; } @@ -95,11 +96,12 @@ class CudaDeviceContext : public DeviceContext { cudnnHandle_t cudnn_handle() { if (!dnn_handle_) { DeviceGuard guard(gpu_place_); - PADDLE_ENFORCE(cudnnCreate(&dnn_handle_) == CUDNN_STATUS_SUCCESS, - "cudnnCreate failed"); PADDLE_ENFORCE( - cudnnSetStream(dnn_handle_, stream_) == CUDNN_STATUS_SUCCESS, - "cudnnSetStream failed"); + paddle::dyload::cudnnCreate(&dnn_handle_) == CUDNN_STATUS_SUCCESS, + "cudnnCreate failed"); + PADDLE_ENFORCE(paddle::dyload::cudnnSetStream(dnn_handle_, stream_) == + CUDNN_STATUS_SUCCESS, + "cudnnSetStream failed"); } return dnn_handle_; } @@ -107,17 +109,17 @@ class CudaDeviceContext : public DeviceContext { curandGenerator_t curand_generator() { if (!rand_generator_) { DeviceGuard guard(gpu_place_); + PADDLE_ENFORCE(paddle::dyload::curandCreateGenerator( + &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) == + CURAND_STATUS_SUCCESS, + "curandCreateGenerator failed"); PADDLE_ENFORCE( - curandCreateGenerator(&rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) == - CURAND_STATUS_SUCCESS, - "curandCreateGenerator failed"); - PADDLE_ENFORCE( - curandSetPseudoRandomGeneratorSeed(rand_generator_, random_seed_) == - CURAND_STATUS_SUCCESS, + paddle::dyload::curandSetPseudoRandomGeneratorSeed( + rand_generator_, random_seed_) == CURAND_STATUS_SUCCESS, "curandSetPseudoRandomGeneratorSeed failed"); - PADDLE_ENFORCE( - curandSetStream(rand_generator_, stream_) == CURAND_STATUS_SUCCESS, - "curandSetStream failed"); + PADDLE_ENFORCE(paddle::dyload::curandSetStream( + rand_generator_, stream_) == CURAND_STATUS_SUCCESS, + "curandSetStream failed"); } return rand_generator_; } @@ -125,19 +127,21 @@ class CudaDeviceContext : public DeviceContext { ~CudaDeviceContext() { Wait(); if (blas_handle_) { - PADDLE_ENFORCE(cublasDestroy(blas_handle_) == CUBLAS_STATUS_SUCCESS, - "cublasDestroy failed"); + PADDLE_ENFORCE( + paddle::dyload::cublasDestroy(blas_handle_) == CUBLAS_STATUS_SUCCESS, + "cublasDestroy failed"); } if (dnn_handle_) { - PADDLE_ENFORCE(cudnnDestroy(dnn_handle_) == CUDNN_STATUS_SUCCESS, - "cudnnDestroy failed"); + PADDLE_ENFORCE( + paddle::dyload::cudnnDestroy(dnn_handle_) == CUDNN_STATUS_SUCCESS, + "cudnnDestroy failed"); } if (rand_generator_) { - 
PADDLE_ENFORCE( - curandDestroyGenerator(rand_generator_) == CURAND_STATUS_SUCCESS, - "curandDestroyGenerator failed"); + PADDLE_ENFORCE(paddle::dyload::curandDestroyGenerator(rand_generator_) == + CURAND_STATUS_SUCCESS, + "curandDestroyGenerator failed"); } delete eigen_stream_; diff --git a/paddle/platform/dynamic_loader.cc b/paddle/platform/dynamic_loader.cc index 9036eaf642..c34abc392c 100644 --- a/paddle/platform/dynamic_loader.cc +++ b/paddle/platform/dynamic_loader.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "dynamic_loader.h" #include -#include "DynamicLoader.h" -#include "Logging.h" +#include DEFINE_string(cudnn_dir, "", "Specify path for loading libcudnn.so. For instance, " From 3f63d96abec165426bcd464f7aff32e2e42ed021 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 3 Jul 2017 23:16:11 +0800 Subject: [PATCH 27/79] Fix link error in op_proto_test. --- paddle/framework/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 50107faaed..f7e5753ac2 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -7,4 +7,4 @@ cc_test(scope_test SRCS scope_test.cc) cc_test(enforce_test SRCS enforce_test.cc) proto_library(attr_type SRCS attr_type.proto) proto_library(op_proto SRCS op_proto.proto) -cc_test(op_proto_test SRCS op_proto_test.cc DEPS attr_type op_proto protobuf) +cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto attr_type protobuf) From e12d7269ff473db5cc87de1344630eb348017a4a Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 4 Jul 2017 01:22:01 +0000 Subject: [PATCH 28/79] fix by helin's comments --- python/paddle/v2/reader/creator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py index 20624d5286..61b5cc134f 100644 --- a/python/paddle/v2/reader/creator.py +++ b/python/paddle/v2/reader/creator.py @@ -106,7 +106,7 @@ def recordio(paths, buf_size=100): while True: r, err = client.next_record() - if r is None: + if err < 0: break yield r From ed18647e37f4e345f02171f29af6e22fab4790ea Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 4 Jul 2017 11:00:59 +0800 Subject: [PATCH 29/79] finish test --- paddle/platform/CMakeLists.txt | 1 - paddle/platform/cuda.h | 1 + paddle/platform/device_context.h | 170 ------------------------- paddle/platform/device_context_test.cu | 29 ----- 4 files changed, 1 insertion(+), 200 deletions(-) delete mode 100644 paddle/platform/device_context.h delete mode 100644 paddle/platform/device_context_test.cu diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index c95b54a4df..ffdc23d599 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -4,4 +4,3 @@ cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) cc_library(dynamic_loader SRCS dynamic_loader.cc) -nv_test(device_context_test SRCS device_context_test.cu DEPS place dynamic_loader glog gflags) diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h index 05290b0e1e..5ed36c0f02 100644 --- a/paddle/platform/cuda.h +++ b/paddle/platform/cuda.h @@ -33,6 +33,7 @@ int GetDeviceCount(void) { throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed"); return count; } + int GetCurrentDeviceId(void) { int device_id; 
throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed"); diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h deleted file mode 100644 index 65e76666a7..0000000000 --- a/paddle/platform/device_context.h +++ /dev/null @@ -1,170 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifndef PADDLE_ONLY_CPU -#include "paddle/platform/cublas.h" -#include "paddle/platform/cuda.h" -#include "paddle/platform/cudnn.h" -#include "paddle/platform/curand.h" -#define EIGEN_USE_GPU -#endif - -#include "paddle/framework/enforce.h" -#include "paddle/platform/place.h" -#include "unsupported/Eigen/CXX11/Tensor" - -namespace paddle { -namespace platform { - -class DeviceContext { - public: - virtual ~DeviceContext() {} -}; - -class CpuDeviceContext : public DeviceContext { - Eigen::DefaultDevice eigen_device() { - if (!eigen_device_) { - eigen_device_ = new Eigen::DefaultDevice(); - } - return *eigen_device_; - } - - private: - Eigen::DefaultDevice* eigen_device_{nullptr}; -}; - -#ifndef PADDLE_ONLY_CPU -class DeviceGuard { - public: - explicit DeviceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) { - if (previous_ != new_place) { - paddle::platform::SetDeviceId(new_place.device); - } - } - - ~DeviceGuard() { paddle::platform::SetDeviceId(previous_.device); } - - private: - GPUPlace previous_; -}; - -class CudaDeviceContext : public DeviceContext { - public: - explicit CudaDeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { - DeviceGuard guard(gpu_place_); - paddle::platform::throw_on_error(cudaStreamCreate(&stream_), - "cudaStreamCreate failed"); - eigen_stream_ = new Eigen::CudaStreamDevice(&stream_); - eigen_device_ = new Eigen::GpuDevice(eigen_stream_); - } - - void Wait() { - paddle::platform::throw_on_error(cudaStreamSynchronize(stream_), - "cudaStreamSynchronize failed"); - } - - cudaStream_t stream() { return stream_; } - - Eigen::GpuDevice eigen_device() { return *eigen_device_; } - - cublasHandle_t cublas_handle() { - if (!blas_handle_) { - DeviceGuard guard(gpu_place_); - PADDLE_ENFORCE( - paddle::dyload::cublasCreate(&blas_handle_) == CUBLAS_STATUS_SUCCESS, - "cublasCreate failed"); - PADDLE_ENFORCE(paddle::dyload::cublasSetStream(blas_handle_, stream_) == - CUBLAS_STATUS_SUCCESS, - "cublasSetStream failed"); - } - return blas_handle_; - } - - cudnnHandle_t cudnn_handle() { - if (!dnn_handle_) { - DeviceGuard guard(gpu_place_); - PADDLE_ENFORCE( - paddle::dyload::cudnnCreate(&dnn_handle_) == CUDNN_STATUS_SUCCESS, - "cudnnCreate failed"); - PADDLE_ENFORCE(paddle::dyload::cudnnSetStream(dnn_handle_, stream_) == - CUDNN_STATUS_SUCCESS, - "cudnnSetStream failed"); - } - return dnn_handle_; - } - - curandGenerator_t curand_generator() { - if (!rand_generator_) { - DeviceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::dyload::curandCreateGenerator( - &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) == - CURAND_STATUS_SUCCESS, - "curandCreateGenerator failed"); - PADDLE_ENFORCE( 
- paddle::dyload::curandSetPseudoRandomGeneratorSeed( - rand_generator_, random_seed_) == CURAND_STATUS_SUCCESS, - "curandSetPseudoRandomGeneratorSeed failed"); - PADDLE_ENFORCE(paddle::dyload::curandSetStream( - rand_generator_, stream_) == CURAND_STATUS_SUCCESS, - "curandSetStream failed"); - } - return rand_generator_; - } - - ~CudaDeviceContext() { - Wait(); - if (blas_handle_) { - PADDLE_ENFORCE( - paddle::dyload::cublasDestroy(blas_handle_) == CUBLAS_STATUS_SUCCESS, - "cublasDestroy failed"); - } - - if (dnn_handle_) { - PADDLE_ENFORCE( - paddle::dyload::cudnnDestroy(dnn_handle_) == CUDNN_STATUS_SUCCESS, - "cudnnDestroy failed"); - } - - if (rand_generator_) { - PADDLE_ENFORCE(paddle::dyload::curandDestroyGenerator(rand_generator_) == - CURAND_STATUS_SUCCESS, - "curandDestroyGenerator failed"); - } - - delete eigen_stream_; - delete eigen_device_; - - paddle::platform::throw_on_error(cudaStreamDestroy(stream_), - "cudaStreamDestroy failed"); - } - - private: - GPUPlace gpu_place_; - cudaStream_t stream_; - - Eigen::CudaStreamDevice* eigen_stream_; - Eigen::GpuDevice* eigen_device_; - - cublasHandle_t blas_handle_{nullptr}; - - cudnnHandle_t dnn_handle_{nullptr}; - - int random_seed_; - curandGenerator_t rand_generator_{nullptr}; -}; -#endif -} // namespace platform -} // namespace paddle diff --git a/paddle/platform/device_context_test.cu b/paddle/platform/device_context_test.cu deleted file mode 100644 index a15fb53b71..0000000000 --- a/paddle/platform/device_context_test.cu +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/platform/device_context.h" -#include "gtest/gtest.h" - - -TEST(DeviceContext, CudaDevice) { - int count = paddle::platform::GetDeviceCount(); - for (int i = 0; i < count; i++) { - paddle::platform::CudaDeviceContext* device_context = new paddle::platform::CudaDeviceContext(i); - __attribute__((unused)) Eigen::GpuDevice gpu_device = device_context->eigen_device(); - __attribute__((unused)) cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); - __attribute__((unused)) cublasHandle_t cublas_handle = device_context->cublas_handle(); - __attribute__((unused)) curandGenerator_t curand_handle = device_context->curand_generator(); - delete device_context; - } -} From 76b7be46da5fe211d25e62712673cc01bea98d54 Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 4 Jul 2017 11:16:49 +0800 Subject: [PATCH 30/79] add deps for dyload cc_library --- paddle/platform/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index ffdc23d599..4f6381b8af 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -3,4 +3,4 @@ nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) -cc_library(dynamic_loader SRCS dynamic_loader.cc) +cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) From 3b073fdc2be1c808db27519e01e3a61c07927959 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 4 Jul 2017 11:25:11 +0800 Subject: [PATCH 31/79] fix error in test_LayerGrad --- paddle/gserver/tests/test_LayerGrad.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index ed067e7c3a..d3c99eb8b9 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -873,7 +873,7 @@ TEST(Layer, SequenceLastInstanceLayer) { TEST(Layer, AverageLayer) { testDegradeLayer(false, "average", "non-seq", -1); // seq average to non-seq testDegradeLayer(false, - "max", + "average", "non-seq", 5); // seq average to a shorten seq, stride window = 5 testDegradeLayer( From f535b79820ae97ade802053dc421a893460367c8 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 4 Jul 2017 12:05:52 +0800 Subject: [PATCH 32/79] sort the Author.md with Alphabetical order --- AUTHORS.md | 74 +++++++++++++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/AUTHORS.md b/AUTHORS.md index 09698ac140..4db4a4a8e7 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -1,48 +1,48 @@ | Github account | name | |---|---| +| backyes | Yan-Fei Wang | | beckett1124 | Bin Qi | -| Canpio | Jiayi Feng | -| chengxiaohua1105 | Xiaohua Cheng | -| xushaoyong | Shaoyong Xu | -| liuyuan | Yuan Liu | -| xujun05 | Jun Xu | -| dzhwinter | Zhihong Dong | -| Guo Sheng | Sheng Guo | -| kuke | Yibing Liu | -| llxxxll | YongFeng Liu | -| cxysteven | Xingyi Cheng | -| NHZlX | Zhaolong Xing | -| pakchoi | Chuanjiang Song | -| pkuyym | Yaming Yang | -| Superjom | Chunwei Yan | -| wanghaoshuang | Haoshuang Wang | -| wangzhen-nlp | Zhen Wang | -| wwhu | Weiwei Hu | -| xinghai-sun | XingHai Sun | -| zhaopu7 | Pu Zhao | -| reyoung | Yang Yu | +| Canpio | Jia-Yi Feng | +| chengxiaohua1105 | Xiao-Hua Cheng | +| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | +| cxysteven | Xing-Yi Cheng | +| dzhwinter | Zhi-Hong Dong | +| emailweixu | Wei Xu | | gangliao | Gang Liao | -| luotao01 | Tao Luo | -| jacquesqiao | Long-Fei 
Qiao | -| qingqing01 | Qing-Qing Dang | +| gongweibao | Wei-Bao Gong | +| Guo Sheng | Sheng Guo | +| Haichao-Zhang | Hai-Chao Zhang | | hedaoyuan | Dao-Yuan He | -| wangyang59 | Yang Wang | +| helinwang | He-Lin Wang | +| jacquesqiao | Long-Fei Qiao | +| kuke | Yi-Bing Liu | +| lcy-seso | Ying Cao | +| lipeng-unisound | Peng Li | +| liuyuan | Yuan Liu | +| livc | Zhao Li | +| llxxxll | Yong-Feng Liu | +| luotao01 | Tao Luo | +| lzhao4ever | Liang Zhao | +| NHZlX | Zhao-Long Xing | +| pakchoi | Chuan-Jiang Song | +| pengli09 | Peng Li | +| pkuyym | Ya-Ming Yang | | QiJune | Jun Qi | +| qingqing01 | Qing-Qing Dang | +| reyoung | Yang Yu | +| Superjom | Chun-Wei Yan | | tianbingsz | Tian-Bing Xu | -| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | | typhoonzero | Yi Wu | -| backyes | Yan-Fei Wang | -| pengli09 | Peng Li | -| livc | Zhao Li | +| wanghaoshuang | Hao-Shuang Wang | +| wangyang59 | Yang Wang | +| wangzhen-nlp | Zhen Wang | +| wen-bo-yang | Wen-Bo Yang | +| wwhu | Wei-Wei Hu | +| xinghai-sun | Xing-Hai Sun | | Xreki | Yi-Qun Liu | +| xujun05 | Jun Xu | +| xushaoyong | Shao-Yong Xu | | Yancey1989 | Xu Yan | -| emailweixu | Wei Xu | -| wen-bo-yang | Wen-Bo Yang | -| helinwang | He-Lin Wang | -| lcy-seso | Ying Cao | -| Zrachel | Rui-Qing Zhang | -| Haichao-Zhang | Hai-Chao Zhang | -| gongweibao | Wei-Bao Gong | -| lzhao4ever | Liang Zhao | +| zhaopu7 | Pu Zhao | | zhouxiao-coder | Xiao Zhou | -| lipeng-unisound | Peng Li | +| Zrachel | Rui-Qing Zhang | From 06156daa281e55fe5d06217cc545cd8c09aa4c9d Mon Sep 17 00:00:00 2001 From: "Superjom (Chunwei Yan)" Date: Tue, 4 Jul 2017 12:07:16 +0800 Subject: [PATCH 33/79] net design with NetBuilder (#2598) * move net_design to framework * change CreateNet result to unique_ptr * rename "ScratchNet" -> "PlainNet" * add three methods to NetBase * add NetBuilder * add InferShape to NetBuilder.Run * rename ApplyGradient, ApplyOptimizer -> AddGradientOps, AddOptimiz * rename PlainNet::CreateNet -> BuildNet * add Error and other rename actions --- paddle/framework/net_design.md | 250 +++++++++++++++++++++++++++++++++ 1 file changed, 250 insertions(+) create mode 100644 paddle/framework/net_design.md diff --git a/paddle/framework/net_design.md b/paddle/framework/net_design.md new file mode 100644 index 0000000000..a5f0483081 --- /dev/null +++ b/paddle/framework/net_design.md @@ -0,0 +1,250 @@ +# Network Design + +`Network` is the container and controller of a set of operators, +user can build a real network from a `NetDesc` which is a protobuf message +and use `Network.Run()` to run all the operators in the network. + +A network object knows all Operators belonging to this network. Variables, +which are inputs and outputs of these operators, +are created and managed by a hierarchy of Scope objects. + +# API + +## Net +To make the `Network` extendable, a base class is defined like this + +```c++ +// operator's index stored in a network. +typedef int OpIndex; + +// The minimum a network should be implemented. +class Net { + public: + // run all the operators and return success(true) or not, with all the + // variables are located in `scope`. `context` describes the detail execution + // environment for ops. `begin` and `end` specify the scope of `ops_` to run, + // If no positive indexes are provided, all operators in `ops_` will run. + virtual Error Run(Scope *scope, OpContext *context, OpIndex begin = -1, + OpIndex end = -1) const = 0; + + // Add an Operator according to `def`. 
+  virtual OpIndex AddOp(const proto::OpDef &def) = 0;
+
+  // Add optimizer operators according to `attrs`.
+  virtual Error AddOptimizerOps(const OptAttrs &attrs) = 0;
+
+  // Add backward operators.
+  virtual Error AddBackwardOps() = 0;
+
+  // Infer the shapes of variables required by operators in the network. The
+  // `scope` will be mutated according to the inferred shapes.
+  virtual Error InferShape(Scope *scope) = 0;
+
+  static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
+};
+```
+
+All network implementations should build networks from a protobuf message which
+describes the structure of a real network; the `Run` method should be implemented by
+all implementations to offer a universal way to forward- or backward-compute a network.
+
+`Net::Create` is a factory method and can be implemented like
+
+```c++
+std::unique_ptr<Net> Net::Create(const NetDesc& def) {
+  switch (def.model_type()) {
+    case NN:
+      return std::unique_ptr<Net>(new Network(def));
+    case Recursive:
+      return std::unique_ptr<Net>(new RecursiveNet(def));
+    case Recurrent:
+      return std::unique_ptr<Net>(new RecurrentNet(def));
+  }
+  return nullptr;
+}
+```
+
+Network is designed as the container of operators. To make it more extendable,
+we decouple it from the related variable resources.
+
+`Run(Scope* scope)` takes the scope as an argument so that it can run in different scopes.
+
+Finally, `Net` can be used as follows
+
+```c++
+Scope default_scope;
+OpContext default_context;
+auto net = Net::Create(def);
+
+if (net) {
+  net->Run(&default_scope, &default_context);
+}
+```
+
+## `PlainNet` as a simple implementation of `Net`
+
+A very basic implementation is as follows. All it does is simply run every operator in sequence.
+
+```c++
+class PlainNet : public Net {
+ public:
+  // Create a network described by `def`. NetDesc is the definition of a network.
+  PlainNet(const NetDesc &def);
+
+  // Infer all the operators' input and output variables' shapes; will be
+  // called before every mini-batch of training.
+  virtual Error InferShape(Scope *scope) override;
+
+  // Run all the operators with the `scope`. If no scope is provided, the default
+  // scope will be used instead. If no OpContext is provided, a default context will be used.
+  virtual Error Run(Scope *scope = nullptr, OpContext *context = nullptr, OpIndex begin = -1,
+                    OpIndex end = -1) const override;
+
+  virtual OpIndex AddOp(const proto::OpDef &def) override;
+
+  virtual Error AddOptimizerOps(const OptAttrs &attrs) override;
+
+  virtual Error AddBackwardOps() override;
+
+ protected:
+  // Create operators according to `def`; will be called by the constructor.
+  Error BuildNet(const NetDesc &def);
+
+  // Add an operator which is identified as `type` and has attributes described
+  // in `attrs`; the `inputs` are the keys of read-only input variables,
+  // `outputs` are keys of mutable output variables. An `OpIndex` will be
+  // returned to indicate the offset of the new operator in `ops_`.
+  OpIndex AddOp(const std::string &type, const std::vector<std::string> &inputs,
+                const std::vector<std::string> &outputs,
+                const OprAttr &attrs = OprAttr());
+
+ private:
+  // the operators owned by `Network`.
+  std::vector<Operator> ops_;
+};
+```
+
+`PlainNet` will create operators so that a private member `ops_` is defined;
+the operators are created by `BuildNet`, and each operator is created by `AddOp`.
+
+
+## PlainNet Usage
+`PlainNet` can be used to define and run a network as follows (see the sketch inside the example for what `Run` boils down to)
+
+```c++
+// create an empty scope located on the CPU device.
+Scope scope(CPUPlace());
+
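// As a concrete reading of "run every operator in sequence" above, the body
// of PlainNet::Run could be as small as the following sketch (assumed
// details: ops_ is kept in topological order and each operator exposes
// Run(Scope*, OpContext*)):
//
//   Error PlainNet::Run(Scope *scope, OpContext *context, OpIndex begin,
//                       OpIndex end) const {
//     OpIndex first = begin < 0 ? 0 : begin;
//     OpIndex last = end < 0 ? static_cast<OpIndex>(ops_.size()) : end;
//     for (OpIndex i = first; i < last; ++i) {
//       ops_[i].Run(scope, context);
//     }
//     return Error();
//   }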
+// create and init variables described in `net_desc`.
+scope.CreateVariables(net_desc);
+scope.InitVariables(net_desc);
+
+// create a network according to `net_desc`
+auto net = Net::Create(net_desc);
+// Add more operators if needed.
+net->AddOp(add...);
+net->AddOp(fc...);
+
+net->AddBackwardOps();
+net->AddOptimizerOps();
+
+// run the network providing the `scope`.
+net->Run(&scope);
+```
+
+## `NetBuilder` as a C++ syntax wrapper
+This is a detailed description of the user-facing C++ network API, and may not be needed in the prototype development stage.
+
+The `NetBuilder` will give users a much simpler syntax, shown below, to create a network, and demonstrates how to use `Net`'s raw interfaces.
+
+```c++
+Variable* fc_out = builder.AddOp("fc", input=image, size=100, activation="Sigmoid");
+Variable* prediction = builder.AddOp("fc", input=fc_out, size=10, activation="Sigmoid");
+Variable* loss = builder.AddOp("cross_entropy", input=prediction, label=label);
+Variable* avg_loss = builder.AddOp("mean", loss);
+
+builder.BackwardFrom(avg_loss);
+builder.AddOptimization(1e-4, "adam");
+builder.Run();
+```
+
+`NetBuilder` will call `Net`'s virtual functions to change the real network structure; here is a sample definition
+
+```c++
+class NetBuilder final {
+ public:
+  NetBuilder(Net* net) : net_(net) {}
+
+  Variable* AddOp(const string& type, const vector<Variable*>& inputs,
+                  size_t size, Activation act) {
+    // much code here.
+    // ...
+    net_->AddOp(def);
+    need_rebuild_net_ = true;
+    net_->InferShape();
+    // ...
+  }
+
+  Error BackwardFrom(const Variable& cost);
+
+  Error Run(Scope* scope, OpContext* context, bool need_backward = true) {
+    // backward.
+    if (need_backward) {
+      if (need_rebuild_net_) {
+        AddBackwardOps();
+        AddOptimizerOps();
+      }
+      return net_->Run(scope, context);
+    }
+    // just forward.
+    return net_->Run(scope, context, 0, last_forward_op_);
+  }
+
+ protected:
+  Error AddBackwardOps();
+  Error AddOptimizerOps();
+
+ private:
+  Net* net_;
+  OpIndex last_forward_op_{-1};
+  bool need_rebuild_net_{true};
+};
+```
+
+## Compatibility with RNN
+
+Benefiting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with a future RNN design;
+for example, we can implement a simple recurrent neural network as follows
+
+```c++
+// copy some `vars` from `source` to `target`
+void Copy(const Scope &source, Scope &target,
+          const std::vector<std::string> &vars);
+
+Scope default_scope;
+// some initial mutations on `default_scope` here.
+
+auto rnn_step_net = PlainNet(rnn_step_net_def);
+
+// Create the rnn's states; the last scope is used to store the rnn outputs.
+Scope *rnn_states = new Scope[num_states + 1];
+
+for (int i = 0; i < num_states + 1; i++) {
+  // Initialize all rnn state scopes, copy parameters and so on.
+  rnn_states[i].CreateVars(rnn_step_net_def);
+  Copy(default_scope, rnn_states[i], rnn_related_vars);
+  // Prepare the rnn's inlinks: just copy inlink variables to each state.
+  Copy(default_scope, rnn_states[i], inlink_vars);
+}
+
+// Run the rnn.
+for (int i = 0; i < num_states; i++) {
+  rnn_step_net.Run(&rnn_states[i]);
+  // Copy the current state's state variables to the next state; the related
+  // variables are named like "previous_state_xxx".
+  Copy(rnn_states[i], rnn_states[i + 1], pre_state_vars);
+}
+
+// Copy the rnn's final outputs to `default_scope`.
+Copy(rnn_states[num_states], default_scope, outlink_vars); +``` From 3de3894b821c06daf596c0818b6c89f4dd185928 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 4 Jul 2017 12:53:00 +0800 Subject: [PATCH 34/79] Add DEPS to `proto_library` Missing DEPS will cause compile error when parallel is large. --- cmake/generic.cmake | 4 ++-- paddle/framework/CMakeLists.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index ca358da8f1..fb2222440c 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -323,10 +323,10 @@ endfunction(go_test) function(proto_library TARGET_NAME) set(oneValueArgs "") - set(multiValueArgs SRCS) + set(multiValueArgs SRCS DEPS) cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(proto_srcs) set(proto_hdrs) protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) - cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS protobuf) + cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf) endfunction() diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index f7e5753ac2..e781866759 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -6,5 +6,5 @@ cc_test(variable_test SRCS variable_test.cc) cc_test(scope_test SRCS scope_test.cc) cc_test(enforce_test SRCS enforce_test.cc) proto_library(attr_type SRCS attr_type.proto) -proto_library(op_proto SRCS op_proto.proto) +proto_library(op_proto SRCS op_proto.proto DEPS attr_type) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto attr_type protobuf) From b8cc07920e3cf623250ea0b9b078049ff1348279 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 13:20:55 +0800 Subject: [PATCH 35/79] FIX: add eigen3 interface deps --- cmake/external/eigen.cmake | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 45f44f617d..39b16c3b2b 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -5,7 +5,7 @@ SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/eigen3) ExternalProject_Add( - eigen3 + extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} # for latest version, please get from official website # URL "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz" @@ -26,4 +26,7 @@ ExternalProject_Add( TEST_COMMAND "" ) -LIST(APPEND external_project_dependencies eigen3) +ADD_LIBRARY(eigen3 INTERFACE) +ADD_DEPENDENCIES(eigen3 extern_eigen3) + +LIST(APPEND external_project_dependencies extern_eigen3) From 414c2b1734bcf39135935a201f7244d79a72e172 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 13:25:30 +0800 Subject: [PATCH 36/79] FIX: add any as interface dep --- cmake/external/any.cmake | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake index 62eea42692..b61e421871 100644 --- a/cmake/external/any.cmake +++ b/cmake/external/any.cmake @@ -5,7 +5,7 @@ SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any) INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any) ExternalProject_Add( - linb_any + extern_lib_any ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/thelink2012/any.git" GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020" @@ -17,5 +17,8 @@ ExternalProject_Add( TEST_COMMAND "" ) +ADD_LIBRARY(lib_any INTERFACE) +ADD_DEPENDENCIES(lib_any extern_lib_any) + add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE) -LIST(APPEND 
external_project_dependencies linb_any)
\ No newline at end of file
+LIST(APPEND external_project_dependencies extern_lib_any)

From b7397031e9c03363c4e3e3119ff371b0d8a13e7c Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 4 Jul 2017 13:47:02 +0800
Subject: [PATCH 37/79] Add target_link_libraries for cc_library

It will fix #2728.

It may seem silly to call `target_link_libraries` on a static library,
because a static library does not need to link against other libraries.
But it tells CMake how to propagate dependencies.

The solution comes from [here](http://floooh.github.io/2016/01/12/cmake-dependency-juggling.html).

* Also change op_proto_test DEPS to verify that this fix works.

---
 cmake/generic.cmake             | 2 ++
 paddle/framework/CMakeLists.txt | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index fb2222440c..cae9524b2f 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -162,6 +162,7 @@ function(cc_library TARGET_NAME)
     endif()
     if (cc_library_DEPS)
       add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
+      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     endif()
   else(cc_library_SRCS)
     if (cc_library_DEPS)
@@ -211,6 +212,7 @@ function(nv_library TARGET_NAME)
     endif()
     if (nv_library_DEPS)
       add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
+      target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
     endif()
   else(nv_library_SRCS)
     if (nv_library_DEPS)
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index e781866759..baad38e3c1 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -7,4 +7,4 @@ cc_test(scope_test SRCS scope_test.cc)
 cc_test(enforce_test SRCS enforce_test.cc)
 proto_library(attr_type SRCS attr_type.proto)
 proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
-cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto attr_type protobuf)
+cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)

From 9eeabe986d039b3fe3b28e5ef98f66d6dd2a3e31 Mon Sep 17 00:00:00 2001
From: qijun
Date: Tue, 4 Jul 2017 14:03:58 +0800
Subject: [PATCH 38/79] follow comments

---
 paddle/platform/cublas.h          | 58 +++++++++++++++++++++----------
 paddle/platform/cudnn.h           | 38 +++++++++++++++-----
 paddle/platform/curand.h          | 40 +++++++++++++++-------
 paddle/platform/dynamic_loader.cc | 16 +++++++--
 paddle/platform/dynamic_loader.h  | 14 ++++----
 5 files changed, 119 insertions(+), 47 deletions(-)

diff --git a/paddle/platform/cublas.h b/paddle/platform/cublas.h
index d60eb501e9..90704f37e6 100644
--- a/paddle/platform/cublas.h
+++ b/paddle/platform/cublas.h
@@ -1,7 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#pragma once + #include #include "paddle/platform/dynamic_loader.h" namespace paddle { +namespace platform { namespace dyload { std::once_flag cublas_dso_flag; @@ -15,15 +32,17 @@ void *cublas_dso_handle = nullptr; * note: default dynamic linked libs */ #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cublasStatus_t operator()(Args... args) { \ - typedef cublasStatus_t (*cublasFunc)(Args...); \ - std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \ - void *p_##__name = dlsym(cublas_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cublasStatus_t operator()(Args... args) { \ + typedef cublasStatus_t (*cublasFunc)(Args...); \ + std::call_once(cublas_dso_flag, \ + paddle::platform::dyload::GetCublasDsoHandle, \ + &cublas_dso_handle); \ + void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ } __name; // struct DynLoad__##__name #else #define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ @@ -68,17 +87,18 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) // clang-format on #ifndef PADDLE_TYPE_DOUBLE -#define CUBLAS_GEAM dynload::cublasSgeam -#define CUBLAS_GEMV dynload::cublasSgemv -#define CUBLAS_GEMM dynload::cublasSgemm -#define CUBLAS_GETRF dynload::cublasSgetrfBatched -#define CUBLAS_GETRI dynload::cublasSgetriBatched +#define CUBLAS_GEAM paddle::platform::dynload::cublasSgeam +#define CUBLAS_GEMV paddle::platform::dynload::cublasSgemv +#define CUBLAS_GEMM paddle::platform::dynload::cublasSgemm +#define CUBLAS_GETRF paddle::platform::dynload::cublasSgetrfBatched +#define CUBLAS_GETRI paddle::platform::dynload::cublasSgetriBatched #else -#define CUBLAS_GEAM dynload::cublasDgeam -#define CUBLAS_GEMV dynload::cublasDgemv -#define CUBLAS_GEMM dynload::cublasDgemm -#define CUBLAS_GETRF dynload::cublasDgetrfBatched -#define CUBLAS_GETRI dynload::cublasDgetriBatched +#define CUBLAS_GEAM paddle::platform::dynload::cublasDgeam +#define CUBLAS_GEMV paddle::platform::dynload::cublasDgemv +#define CUBLAS_GEMM paddle::platform::dynload::cublasDgemm +#define CUBLAS_GETRF paddle::platform::dynload::cublasDgetrfBatched +#define CUBLAS_GETRI paddle::platform::dynload::cublasDgetriBatched #endif } // namespace dyload +} // namespace platform } // namespace paddle diff --git a/paddle/platform/cudnn.h b/paddle/platform/cudnn.h index ab878cd555..06e2a05d86 100644 --- a/paddle/platform/cudnn.h +++ b/paddle/platform/cudnn.h @@ -1,7 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + #include #include "paddle/platform/dynamic_loader.h" namespace paddle { +namespace platform { namespace dyload { std::once_flag cudnn_dso_flag; @@ -9,15 +26,17 @@ void* cudnn_dso_handle = nullptr; #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using cudnn_func = decltype(__name(args...)) (*)(Args...); \ - std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \ - void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using cudnn_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(cudnn_dso_flag, \ + paddle::platform::dyload::GetCudnnDsoHandle, \ + &cudnn_dso_handle); \ + void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ } __name; /* struct DynLoad__##__name */ #else @@ -111,4 +130,5 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP) #undef CUDNN_DNN_ROUTINE_EACH // clang-format on } // namespace dyload +} // namespace platform } // namespace paddle diff --git a/paddle/platform/curand.h b/paddle/platform/curand.h index edff6526bd..a9cbe48ef8 100644 --- a/paddle/platform/curand.h +++ b/paddle/platform/curand.h @@ -1,20 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + #include #include "paddle/platform/dynamic_loader.h" namespace paddle { +namespace platform { namespace dyload { std::once_flag curand_dso_flag; void *curand_dso_handle = nullptr; #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - curandStatus_t operator()(Args... args) { \ - typedef curandStatus_t (*curandFunc)(Args...); \ - std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \ - void *p_##__name = dlsym(curand_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... 
args) { \ + typedef curandStatus_t (*curandFunc)(Args...); \ + std::call_once(curand_dso_flag, \ + paddle::platform::dyload::GetCurandDsoHandle, \ + &curand_dso_handle); \ + void *p_##__name = dlsym(curand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ } __name; /* struct DynLoad__##__name */ #else #define DYNAMIC_LOAD_CURAND_WRAP(__name) \ @@ -41,5 +60,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) #undef CURAND_RAND_ROUTINE_EACH #undef DYNAMIC_LOAD_CURAND_WRAP -} +} // namespace dyload +} // namespace platform } // namespace paddle diff --git a/paddle/platform/dynamic_loader.cc b/paddle/platform/dynamic_loader.cc index c34abc392c..9e0aadf8e2 100644 --- a/paddle/platform/dynamic_loader.cc +++ b/paddle/platform/dynamic_loader.cc @@ -13,8 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "dynamic_loader.h" -#include -#include +#include +#include +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" DEFINE_string(cudnn_dir, "", "Specify path for loading libcudnn.so. For instance, " @@ -30,6 +34,10 @@ DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); +namespace paddle { +namespace platform { +namespace dyload { + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -155,3 +163,7 @@ void GetLapackDsoHandle(void** dso_handle) { GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle); #endif } + +} // namespace dyload +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/dynamic_loader.h b/paddle/platform/dynamic_loader.h index 9b5ad21724..bb58fcba17 100644 --- a/paddle/platform/dynamic_loader.h +++ b/paddle/platform/dynamic_loader.h @@ -12,13 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifndef DYNAMIC_LOAD_H_ -#define DYNAMIC_LOAD_H_ +#pragma once -#include -#include -#include -#include +namespace paddle { +namespace platform { +namespace dyload { /** * @brief load the DSO of CUBLAS @@ -60,4 +58,6 @@ void GetWarpCTCDsoHandle(void** dso_handle); */ void GetLapackDsoHandle(void** dso_handle); -#endif // DYNAMIC_LOAD_H_ +} // namespace dyload +} // namespace platform +} // namespace paddle From 3567ea6d7c85bdfdd42f57f6db98e5bc9bbac5e2 Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 4 Jul 2017 14:58:20 +0800 Subject: [PATCH 39/79] move to dynload directory --- paddle/platform/CMakeLists.txt | 4 +-- paddle/platform/dynload/CMakeLists.txt | 1 + paddle/platform/{ => dynload}/cublas.h | 26 +++++++++---------- paddle/platform/{ => dynload}/cudnn.h | 26 +++++++++---------- paddle/platform/{ => dynload}/curand.h | 26 +++++++++---------- .../platform/{ => dynload}/dynamic_loader.cc | 4 +-- .../platform/{ => dynload}/dynamic_loader.h | 4 +-- 7 files changed, 46 insertions(+), 45 deletions(-) create mode 100644 paddle/platform/dynload/CMakeLists.txt rename paddle/platform/{ => dynload}/cublas.h (95%) rename paddle/platform/{ => dynload}/cudnn.h (97%) rename paddle/platform/{ => dynload}/curand.h (93%) rename paddle/platform/{ => dynload}/dynamic_loader.cc (99%) rename paddle/platform/{ => dynload}/dynamic_loader.h (96%) diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 4f6381b8af..cc6b52e927 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,6 +1,6 @@ +add_subdirectory(dynload) + nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) - -cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt new file mode 100644 index 0000000000..9f829b7012 --- /dev/null +++ b/paddle/platform/dynload/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) diff --git a/paddle/platform/cublas.h b/paddle/platform/dynload/cublas.h similarity index 95% rename from paddle/platform/cublas.h rename to paddle/platform/dynload/cublas.h index 90704f37e6..c9150ac573 100644 --- a/paddle/platform/cublas.h +++ b/paddle/platform/dynload/cublas.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace platform { -namespace dyload { +namespace dynload { std::once_flag cublas_dso_flag; void *cublas_dso_handle = nullptr; @@ -32,17 +32,17 @@ void *cublas_dso_handle = nullptr; * note: default dynamic linked libs */ #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cublasStatus_t operator()(Args... args) { \ - typedef cublasStatus_t (*cublasFunc)(Args...); \ - std::call_once(cublas_dso_flag, \ - paddle::platform::dyload::GetCublasDsoHandle, \ - &cublas_dso_handle); \ - void *p_##__name = dlsym(cublas_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cublasStatus_t operator()(Args... 
args) { \ + typedef cublasStatus_t (*cublasFunc)(Args...); \ + std::call_once(cublas_dso_flag, \ + paddle::platform::dynload::GetCublasDsoHandle, \ + &cublas_dso_handle); \ + void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ } __name; // struct DynLoad__##__name #else #define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ @@ -99,6 +99,6 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) #define CUBLAS_GETRF paddle::platform::dynload::cublasDgetrfBatched #define CUBLAS_GETRI paddle::platform::dynload::cublasDgetriBatched #endif -} // namespace dyload +} // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/cudnn.h b/paddle/platform/dynload/cudnn.h similarity index 97% rename from paddle/platform/cudnn.h rename to paddle/platform/dynload/cudnn.h index 06e2a05d86..c03424b375 100644 --- a/paddle/platform/cudnn.h +++ b/paddle/platform/dynload/cudnn.h @@ -19,24 +19,24 @@ limitations under the License. */ namespace paddle { namespace platform { -namespace dyload { +namespace dynload { std::once_flag cudnn_dso_flag; void* cudnn_dso_handle = nullptr; #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using cudnn_func = decltype(__name(args...)) (*)(Args...); \ - std::call_once(cudnn_dso_flag, \ - paddle::platform::dyload::GetCudnnDsoHandle, \ - &cudnn_dso_handle); \ - void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using cudnn_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(cudnn_dso_flag, \ + paddle::platform::dynload::GetCudnnDsoHandle, \ + &cudnn_dso_handle); \ + void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ } __name; /* struct DynLoad__##__name */ #else @@ -129,6 +129,6 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP) #undef CUDNN_DNN_ROUTINE_EACH // clang-format on -} // namespace dyload +} // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/curand.h b/paddle/platform/dynload/curand.h similarity index 93% rename from paddle/platform/curand.h rename to paddle/platform/dynload/curand.h index a9cbe48ef8..1ef7a8c833 100644 --- a/paddle/platform/curand.h +++ b/paddle/platform/dynload/curand.h @@ -19,21 +19,21 @@ limitations under the License. */ namespace paddle { namespace platform { -namespace dyload { +namespace dynload { std::once_flag curand_dso_flag; void *curand_dso_handle = nullptr; #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - curandStatus_t operator()(Args... args) { \ - typedef curandStatus_t (*curandFunc)(Args...); \ - std::call_once(curand_dso_flag, \ - paddle::platform::dyload::GetCurandDsoHandle, \ - &curand_dso_handle); \ - void *p_##__name = dlsym(curand_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... 
args) { \ + typedef curandStatus_t (*curandFunc)(Args...); \ + std::call_once(curand_dso_flag, \ + paddle::platform::dynload::GetCurandDsoHandle, \ + &curand_dso_handle); \ + void *p_##__name = dlsym(curand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ } __name; /* struct DynLoad__##__name */ #else #define DYNAMIC_LOAD_CURAND_WRAP(__name) \ @@ -60,6 +60,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) #undef CURAND_RAND_ROUTINE_EACH #undef DYNAMIC_LOAD_CURAND_WRAP -} // namespace dyload +} // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc similarity index 99% rename from paddle/platform/dynamic_loader.cc rename to paddle/platform/dynload/dynamic_loader.cc index 9e0aadf8e2..8ef67bad8c 100644 --- a/paddle/platform/dynamic_loader.cc +++ b/paddle/platform/dynload/dynamic_loader.cc @@ -36,7 +36,7 @@ DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); namespace paddle { namespace platform { -namespace dyload { +namespace dynload { static inline std::string join(const std::string& part1, const std::string& part2) { @@ -164,6 +164,6 @@ void GetLapackDsoHandle(void** dso_handle) { #endif } -} // namespace dyload +} // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynamic_loader.h b/paddle/platform/dynload/dynamic_loader.h similarity index 96% rename from paddle/platform/dynamic_loader.h rename to paddle/platform/dynload/dynamic_loader.h index bb58fcba17..a99b05443f 100644 --- a/paddle/platform/dynamic_loader.h +++ b/paddle/platform/dynload/dynamic_loader.h @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { -namespace dyload { +namespace dynload { /** * @brief load the DSO of CUBLAS @@ -58,6 +58,6 @@ void GetWarpCTCDsoHandle(void** dso_handle); */ void GetLapackDsoHandle(void** dso_handle); -} // namespace dyload +} // namespace dynload } // namespace platform } // namespace paddle From a211374d53090733667f2be2cf629cf858757c6d Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 15:01:05 +0800 Subject: [PATCH 40/79] FIX: interface deps under cmake < 3.3 --- cmake/external/any.cmake | 13 ++++++++++--- cmake/external/eigen.cmake | 13 ++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake index b61e421871..edf6edc0bd 100644 --- a/cmake/external/any.cmake +++ b/cmake/external/any.cmake @@ -17,8 +17,15 @@ ExternalProject_Add( TEST_COMMAND "" ) -ADD_LIBRARY(lib_any INTERFACE) -ADD_DEPENDENCIES(lib_any extern_lib_any) +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";") + add_library(lib_any STATIC ${dummyfile}) +else() + add_library(lib_any INTERFACE) +endif() + +add_dependencies(lib_any extern_lib_any) add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE) -LIST(APPEND external_project_dependencies extern_lib_any) +LIST(APPEND external_project_dependencies lib_any) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 39b16c3b2b..1f2fdcac65 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -26,7 +26,14 @@ ExternalProject_Add( TEST_COMMAND "" ) -ADD_LIBRARY(eigen3 INTERFACE) -ADD_DEPENDENCIES(eigen3 extern_eigen3) +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile 
${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_eigen3 = \"${dummyfile}\";") + add_library(eigen3 STATIC ${dummyfile}) +else() + add_library(eigen3 INTERFACE) +endif() -LIST(APPEND external_project_dependencies extern_eigen3) +add_dependencies(eigen3 extern_eigen3) + +LIST(APPEND external_project_dependencies eigen3) From 817f317bef82eb2c024927e6a62b048a1ba93d4a Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 15:39:08 +0800 Subject: [PATCH 41/79] FIX: INTERFACE path --- cmake/external/any.cmake | 2 +- cmake/external/eigen.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake index edf6edc0bd..45e3764e84 100644 --- a/cmake/external/any.cmake +++ b/cmake/external/any.cmake @@ -2,7 +2,7 @@ INCLUDE(ExternalProject) SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any) -INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any) +INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any) ExternalProject_Add( extern_lib_any diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 1f2fdcac65..3e6cedbb0d 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -2,7 +2,7 @@ INCLUDE(ExternalProject) SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) -INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/eigen3) +INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3) ExternalProject_Add( extern_eigen3 From 9045063b535c400ff8ebf20d0b8534103ec6d9ab Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 4 Jul 2017 15:58:15 +0800 Subject: [PATCH 42/79] pserver etcd client (#2559) * init etcd cclient * add etcd * add etcd.go * fix compile problem * move code to etcd.go * add etcd_lister.go for pserver client * add etcd_client_test.go * merge etcd_client_test and client_test * refine client_test.go * refine code * format code * add TODO and use interface instead of struct * fix typo of initDesiredPservers * optimize dir structure of go/pserver/client * add a flag to config index for pserver * follow comment * fix path * optimize code * remove err in pserver NewEtcd * restore comment about /ps_desired --- CMakeLists.txt | 2 +- go/CMakeLists.txt | 2 +- go/cmd/pserver/pserver.go | 16 ++- go/master/etcd_client.go | 4 +- .../{cclient => client/c}/CMakeLists.txt | 2 +- go/pserver/{cclient => client/c}/cclient.go | 26 ++-- .../{cclient => client/c}/test/CMakeLists.txt | 0 .../{cclient => client/c}/test/test_cclient.c | 0 .../{cclient => client/c}/test/test_mnist.py | 0 .../{cclient => client/c}/test/test_train.py | 0 .../c}/test/testdata/optimizer.pb | Bin go/pserver/{ => client}/client.go | 17 +-- go/pserver/{ => client}/client_test.go | 77 +++++++++-- go/pserver/client/etcd_client.go | 125 ++++++++++++++++++ go/pserver/etcd_client.go | 13 +- go/pserver/optimizer.go | 2 +- go/pserver/optimizer_test.go | 2 +- go/pserver/service.go | 3 - go/pserver/service_test.go | 8 +- 19 files changed, 246 insertions(+), 53 deletions(-) rename go/pserver/{cclient => client/c}/CMakeLists.txt (67%) rename go/pserver/{cclient => client/c}/cclient.go (88%) rename go/pserver/{cclient => client/c}/test/CMakeLists.txt (100%) rename go/pserver/{cclient => client/c}/test/test_cclient.c (100%) rename go/pserver/{cclient => client/c}/test/test_mnist.py (100%) rename go/pserver/{cclient => client/c}/test/test_train.py (100%) rename go/pserver/{cclient => client/c}/test/testdata/optimizer.pb (100%) rename go/pserver/{ => client}/client.go (92%) rename go/pserver/{ => client}/client_test.go (54%) create 
mode 100644 go/pserver/client/etcd_client.go diff --git a/CMakeLists.txt b/CMakeLists.txt index 5349f59805..5bedbbefa8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -113,7 +113,7 @@ include(coveralls) # set code coverage include_directories("${PROJ_ROOT}") include_directories("${PROJ_ROOT}/paddle/cuda/include") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") -include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/cclient") +include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c") include_directories(${Boost_INCLUDE_DIRS}) set(EXTERNAL_LIBS diff --git a/go/CMakeLists.txt b/go/CMakeLists.txt index 014697d155..f00c70a058 100644 --- a/go/CMakeLists.txt +++ b/go/CMakeLists.txt @@ -13,7 +13,7 @@ # limitations under the License. # -add_subdirectory(pserver/cclient) +add_subdirectory(pserver/client/c) add_subdirectory(cmd/pserver) add_subdirectory(cmd/master) add_subdirectory(master/c) diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index 8a42d4f8af..31ef450f03 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -15,6 +15,7 @@ import ( func main() { port := flag.Int("port", 0, "port of the pserver") + index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0") etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379", "comma separated endpoint string for pserver to connect to etcd") etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls") @@ -29,11 +30,16 @@ func main() { } log.SetLevel(level) - timeout := time.Second * time.Duration((*etcdTimeout)) - e := pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout) - idx, err := e.Register() - if err != nil { - panic(err) + var idx int + if *index >= 0 { + idx = *index + } else { + timeout := time.Second * time.Duration((*etcdTimeout)) + e := pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout) + idx, err = e.Register() + if err != nil { + panic(err) + } } s, err := pserver.NewService(idx) diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go index e27c014792..04c1394e96 100644 --- a/go/master/etcd_client.go +++ b/go/master/etcd_client.go @@ -50,7 +50,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat lock := concurrency.NewMutex(sess, lockPath) // It's fine for the lock to get stuck, in this case we have // multiple master servers running (only configured to have - // one master running, but split-brain problem may cuase + // one master running, but split-brain problem may cause // multiple master servers running), and the cluster management // software will kill one of them. log.Debugf("Trying to acquire lock at %s.", lockPath) @@ -98,7 +98,7 @@ func (e *EtcdClient) Save(state []byte) error { // We lost the master lock and can not acquire // it back, it means some other master is // already started. We don't want cluster - // managment system to kill the master server + // management system to kill the master server // who is holding the lock and running // correctly. So the most feasible solution is // to kill current master server. 
The current diff --git a/go/pserver/cclient/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt similarity index 67% rename from go/pserver/cclient/CMakeLists.txt rename to go/pserver/client/c/CMakeLists.txt index 7fe74c62f1..a3fcaeef19 100644 --- a/go/pserver/cclient/CMakeLists.txt +++ b/go/pserver/client/c/CMakeLists.txt @@ -1,5 +1,5 @@ cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf) -go_library(paddle_pserver_cclient STATIC) +go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer) if(WITH_TESTING) add_subdirectory(test) endif() diff --git a/go/pserver/cclient/cclient.go b/go/pserver/client/c/cclient.go similarity index 88% rename from go/pserver/cclient/cclient.go rename to go/pserver/client/c/cclient.go index bbaf43d9f1..7ddaceb7ed 100644 --- a/go/pserver/cclient/cclient.go +++ b/go/pserver/client/c/cclient.go @@ -30,15 +30,16 @@ import ( "unsafe" "github.com/PaddlePaddle/Paddle/go/pserver" + "github.com/PaddlePaddle/Paddle/go/pserver/client" log "github.com/sirupsen/logrus" ) var nullPtr = unsafe.Pointer(uintptr(0)) var mu sync.Mutex -var handleMap = make(map[C.paddle_pserver_client]*pserver.Client) +var handleMap = make(map[C.paddle_pserver_client]*client.Client) var curHandle C.paddle_pserver_client -func add(c *pserver.Client) C.paddle_pserver_client { +func add(c *client.Client) C.paddle_pserver_client { mu.Lock() defer mu.Unlock() client := curHandle @@ -47,13 +48,13 @@ func add(c *pserver.Client) C.paddle_pserver_client { return client } -func get(client C.paddle_pserver_client) *pserver.Client { +func get(client C.paddle_pserver_client) *client.Client { mu.Lock() defer mu.Unlock() return handleMap[client] } -func remove(client C.paddle_pserver_client) *pserver.Client { +func remove(client C.paddle_pserver_client) *client.Client { mu.Lock() defer mu.Unlock() h := handleMap[client] @@ -80,9 +81,9 @@ func (s selector) Select() bool { return bool(s) } -type lister []pserver.Server +type lister []client.Server -func (l lister) List() []pserver.Server { +func (l lister) List() []client.Server { return l } @@ -90,19 +91,22 @@ func (l lister) List() []pserver.Server { func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_client { a := C.GoString(addrs) as := strings.Split(a, ",") - servers := make([]pserver.Server, len(as)) + servers := make([]client.Server, len(as)) for i := range as { servers[i].Index = i servers[i].Addr = as[i] } - c := pserver.NewClient(lister(servers), len(as), selector(selected != 0)) + c := client.NewClient(lister(servers), len(as), selector(selected != 0)) return add(c) } //export paddle_new_etcd_pserver_client -func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.paddle_pserver_client { - // TODO(helin): fault tolerant pserver client using etcd. 
- panic("not implemented.") +func paddle_new_etcd_pserver_client(etcd_endpoints *C.char, selected int) C.paddle_pserver_client { + // TODO(Longfei: use etcd lock to decide which trainer to initialize the parameters) + addr := C.GoString(etcd_endpoints) + etcd_client := client.NewEtcd(addr) + c := client.NewClient(etcd_client, etcd_client.Desired(), selector(selected != 0)) + return add(c) } //export paddle_pserver_client_release diff --git a/go/pserver/cclient/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt similarity index 100% rename from go/pserver/cclient/test/CMakeLists.txt rename to go/pserver/client/c/test/CMakeLists.txt diff --git a/go/pserver/cclient/test/test_cclient.c b/go/pserver/client/c/test/test_cclient.c similarity index 100% rename from go/pserver/cclient/test/test_cclient.c rename to go/pserver/client/c/test/test_cclient.c diff --git a/go/pserver/cclient/test/test_mnist.py b/go/pserver/client/c/test/test_mnist.py similarity index 100% rename from go/pserver/cclient/test/test_mnist.py rename to go/pserver/client/c/test/test_mnist.py diff --git a/go/pserver/cclient/test/test_train.py b/go/pserver/client/c/test/test_train.py similarity index 100% rename from go/pserver/cclient/test/test_train.py rename to go/pserver/client/c/test/test_train.py diff --git a/go/pserver/cclient/test/testdata/optimizer.pb b/go/pserver/client/c/test/testdata/optimizer.pb similarity index 100% rename from go/pserver/cclient/test/testdata/optimizer.pb rename to go/pserver/client/c/test/testdata/optimizer.pb diff --git a/go/pserver/client.go b/go/pserver/client/client.go similarity index 92% rename from go/pserver/client.go rename to go/pserver/client/client.go index 6938b9d5ce..aa8bfe30c2 100644 --- a/go/pserver/client.go +++ b/go/pserver/client/client.go @@ -1,4 +1,4 @@ -package pserver +package client import ( "errors" @@ -7,6 +7,7 @@ import ( "time" "github.com/PaddlePaddle/Paddle/go/connection" + "github.com/PaddlePaddle/Paddle/go/pserver" log "github.com/sirupsen/logrus" ) @@ -105,7 +106,7 @@ func (c *Client) BeginInitParams() bool { } // InitParam initializes the parameter on parameter servers. -func (c *Client) InitParam(paramWithConfigs ParameterWithConfig) error { +func (c *Client) InitParam(paramWithConfigs pserver.ParameterWithConfig) error { return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, nil) } @@ -123,13 +124,13 @@ func (c *Client) FinishInitParams() error { // SendGrads sends gradients to parameter servers for updating // parameters. -func (c *Client) SendGrads(grads []Gradient) error { +func (c *Client) SendGrads(grads []pserver.Gradient) error { if len(grads) == 0 { return errors.New("no gradient received") } errCh := make(chan error, len(grads)) for _, g := range grads { - go func(g Gradient) { + go func(g pserver.Gradient) { err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, nil) errCh <- err }(g) @@ -151,7 +152,7 @@ func (c *Client) SendGrads(grads []Gradient) error { type result struct { idx int - param Parameter + param pserver.Parameter err error } @@ -170,12 +171,12 @@ func (r results) Swap(i int, j int) { } // GetParams gets parameters from parameter servers. 
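+// For example (the parameter names here are placeholders; "param_a" follows
+// the naming used in the service tests below):
+//
+//	params, err := c.GetParams([]string{"param_a", "param_b"})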
-func (c *Client) GetParams(names []string) ([]Parameter, error) { +func (c *Client) GetParams(names []string) ([]pserver.Parameter, error) { rCh := make(chan result, len(names)) for idx, name := range names { go func(name string, idx int) { - var parameter Parameter + var parameter pserver.Parameter err := c.pservers[c.partition(name)].Call("Service.GetParam", name, ¶meter) rCh <- result{idx: idx, param: parameter, err: err} }(name, idx) @@ -196,7 +197,7 @@ func (c *Client) GetParams(names []string) ([]Parameter, error) { } sort.Sort(rs) - ps := make([]Parameter, len(rs)) + ps := make([]pserver.Parameter, len(rs)) for i := range rs { ps[i] = rs[i].param } diff --git a/go/pserver/client_test.go b/go/pserver/client/client_test.go similarity index 54% rename from go/pserver/client_test.go rename to go/pserver/client/client_test.go index b805efa921..29b400812c 100644 --- a/go/pserver/client_test.go +++ b/go/pserver/client/client_test.go @@ -1,6 +1,7 @@ -package pserver_test +package client_test import ( + "context" "io/ioutil" "net" "net/http" @@ -8,15 +9,25 @@ import ( "strconv" "strings" "testing" + "time" "github.com/PaddlePaddle/Paddle/go/pserver" + "github.com/PaddlePaddle/Paddle/go/pserver/client" + "github.com/coreos/etcd/clientv3" + log "github.com/sirupsen/logrus" ) -const numPserver = 10 +const ( + numPserver = 10 + etcdEndpoints = "127.0.0.1:2379" + timeout = 2 * time.Second +) -var port [numPserver]int +var pserverClientPorts [numPserver]int -func init() { +// this function init pserver client and return their ports in an array. +func initClient() [numPserver]int { + var ports [numPserver]int for i := 0; i < numPserver; i++ { l, err := net.Listen("tcp", ":0") if err != nil { @@ -28,7 +39,7 @@ func init() { if err != nil { panic(err) } - port[i] = p + ports[i] = p go func(l net.Listener) { s, err := pserver.NewService(0) @@ -49,6 +60,31 @@ func init() { } }(l) } + return ports +} + +func initNativeClient() { + pserverClientPorts = initClient() +} + +func initEtcdClient() { + client, err := clientv3.New(clientv3.Config{ + Endpoints: []string{etcdEndpoints}, + DialTimeout: time.Second * time.Duration(1), + }) + if err != nil { + log.Errorf("err %v", err) + } + ctx, cancel := context.WithTimeout(context.Background(), timeout) + client.Delete(ctx, pserver.PsDesired) + client.Delete(ctx, pserver.PsPath) + client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver)) + ports := initClient() + for i := 0; i < numPserver; i++ { + client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i])) + } + cancel() + client.Close() } type selector bool @@ -57,25 +93,20 @@ func (s selector) Select() bool { return bool(s) } -type lister []pserver.Server +type lister []client.Server -func (l lister) List() []pserver.Server { +func (l lister) List() []client.Server { return l } -func TestClientFull(t *testing.T) { - servers := make([]pserver.Server, numPserver) - for i := 0; i < numPserver; i++ { - servers[i] = pserver.Server{Index: i, Addr: ":" + strconv.Itoa(port[i])} - } - c := pserver.NewClient(lister(servers), len(servers), selector(true)) +func ClientTest(t *testing.T, c *client.Client) { selected := c.BeginInitParams() if !selected { t.Fatal("should be selected.") } const numParameter = 100 - config, err := ioutil.ReadFile("./cclient/test/testdata/optimizer.pb") + config, err := ioutil.ReadFile("./c/test/testdata/optimizer.pb") if err != nil { t.Fatalf("read optimizer proto failed") } @@ -129,3 +160,21 @@ func TestClientFull(t *testing.T) { } } } + +func TestNativeClient(t 
*testing.T) {
+	initNativeClient()
+	servers := make([]client.Server, numPserver)
+	for i := 0; i < numPserver; i++ {
+		servers[i] = client.Server{Index: i, Addr: ":" + strconv.Itoa(pserverClientPorts[i])}
+	}
+	c1 := client.NewClient(lister(servers), len(servers), selector(true))
+	ClientTest(t, c1)
+}
+
+// TODO: the etcdClient test is temporarily disabled, since it depends on a running etcd server.
+func EtcdClient(t *testing.T) {
+	initEtcdClient()
+	etcd_client := client.NewEtcd(etcdEndpoints)
+	c2 := client.NewClient(etcd_client, etcd_client.Desired(), selector(true))
+	ClientTest(t, c2)
+}
diff --git a/go/pserver/client/etcd_client.go b/go/pserver/client/etcd_client.go
new file mode 100644
index 0000000000..1fd3479aa8
--- /dev/null
+++ b/go/pserver/client/etcd_client.go
@@ -0,0 +1,125 @@
+package client
+
+import (
+	"context"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/pserver"
+	"github.com/coreos/etcd/clientv3"
+	log "github.com/sirupsen/logrus"
+)
+
+const (
+	DefaultEtcdTimeout time.Duration = 5 * time.Second
+)
+
+// EtcdClient is used by the pserver client that is a part of the trainer process.
+// TODO:
+// 1. add a watcher to watch the changed state of pservers
+// 2. add an etcd lock
+type EtcdClient struct {
+	client    *clientv3.Client
+	timeout   time.Duration
+	endpoints []string
+}
+
+// Desired reads the desired pserver count from etcd.
+func (p *EtcdClient) Desired() int {
+	var psDesired int
+	for {
+		ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
+		resp, err := p.client.Get(ctx, pserver.PsDesired)
+		cancel()
+		if err != nil {
+			log.Errorf("Get ps desired number failed! reconnecting..., %v", err)
+			time.Sleep(p.timeout)
+			continue
+		}
+
+		kvs := resp.Kvs
+		if len(kvs) == 0 {
+			log.Infoln("Waiting for ps desired to be registered ...")
+			time.Sleep(p.timeout)
+			continue
+		}
+
+		psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
+		if err != nil {
+			log.Errorf("psDesired %d invalid %v", psDesired, err)
+			time.Sleep(p.timeout)
+			continue
+		}
+
+		log.Debugf("Get psDesired number: %d", psDesired)
+		break
+	}
+	return psDesired
+}
+
+// List returns the pserver list read from etcd.
+func (p *EtcdClient) List() []Server {
+	psDesired := p.Desired()
+
+	servers := make([]Server, psDesired)
+	for {
+		for i := 0; i < psDesired; i++ {
+			ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
+			psKey := pserver.PsPath + strconv.Itoa(i)
+			log.Debugf("checking %s", psKey)
+			resp, err := p.client.Get(ctx, psKey)
+			cancel()
+			if err != nil {
+				log.Infof("Get psKey= %s error, %v", psKey, err)
+				time.Sleep(p.timeout)
+				continue
+			}
+			kvs := resp.Kvs
+			if len(kvs) == 0 {
+				log.Infof("Waiting for ps addr registered ...")
+				time.Sleep(p.timeout)
+				continue
+			}
+
+			psAddr := string(resp.Kvs[0].Value)
+			// TODO(Longfei) check the ps address
+			if psAddr == "" {
+				log.Infof("Get psKey = %s, psAddr is empty", psKey)
+				time.Sleep(p.timeout)
+				continue
+			}
+			log.Infof("got value (%s) for key: %s", psAddr, psKey)
+			servers[i].Index = i
+			servers[i].Addr = psAddr
+		}
+		break
+	}
+	return servers
+}
+
+// NewEtcd creates an etcd client that reads the state of pservers from etcd.
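+// A minimal use from a trainer process, mirroring the client test above
+// (the endpoint string and selector value are placeholders):
+//
+//	e := NewEtcd("127.0.0.1:2379")
+//	c := NewClient(e, e.Desired(), selector(true))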
+func NewEtcd(endpoints string) *EtcdClient { + ep := strings.Split(endpoints, ",") + var cli *clientv3.Client + var err error + for { + cli, err = clientv3.New(clientv3.Config{ + Endpoints: ep, + DialTimeout: DefaultEtcdTimeout, + }) + if err != nil { + log.Errorf("Init etcd connection failed: %v", err) + time.Sleep(DefaultEtcdTimeout) + continue + } + break + } + log.Infof("Connected to etcd: %s\n", endpoints) + client := &EtcdClient{ + client: cli, + timeout: DefaultEtcdTimeout, + endpoints: ep, + } + return client +} diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go index 4d88243edd..37b8d522c1 100644 --- a/go/pserver/etcd_client.go +++ b/go/pserver/etcd_client.go @@ -13,6 +13,13 @@ import ( log "github.com/sirupsen/logrus" ) +const ( + // PsDesired is etcd path for store desired pserver count + PsDesired = "/ps_desired" + // PsAddr is the base dir for pserver to store their addr + PsPath = "/ps/" +) + // EtcdClient is the etcd client that the pserver uses for fault // tolerance, service registry and coordination. type EtcdClient struct { @@ -68,7 +75,7 @@ func (e *EtcdClient) Register() (int, error) { // it at the same time. for { ctx, cancel := context.WithTimeout(context.Background(), time.Second) - _, err := e.initDesiredPsercers(ctx, e.numPservers) + _, err := e.initDesiredPservers(ctx, e.numPservers) cancel() if err != nil { log.Warn(err) @@ -120,7 +127,7 @@ func (e *EtcdClient) Register() (int, error) { return pserverIdx, nil } -func (e *EtcdClient) initDesiredPsercers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) { +func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) { return concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { dsStr := c.Get(PsDesired) if dsStr == "" { @@ -136,7 +143,7 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { _, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { registered := false for i := 0; i < e.desired; i++ { - psKey := "/ps/" + strconv.Itoa(i) + psKey := PsPath + strconv.Itoa(i) log.Debugf("checking %s", psKey) ps := c.Get(psKey) log.Debugf("got value (%s) for key: %s", ps, psKey) diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index b4a040f46b..bca3718af3 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -2,7 +2,7 @@ package pserver // #cgo CFLAGS: -I ../../ // //FIXME: ldflags contain "build" path -// #cgo LDFLAGS: ../../build/go/pserver/cclient/libpaddle_go_optimizer.a -lstdc++ +// #cgo LDFLAGS: ../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ // #include "paddle/optimizer/optimizer.h" // #include // #include diff --git a/go/pserver/optimizer_test.go b/go/pserver/optimizer_test.go index b99b5a5f0b..0b2f4cfa41 100644 --- a/go/pserver/optimizer_test.go +++ b/go/pserver/optimizer_test.go @@ -11,7 +11,7 @@ func TestOptimizerCreateRelease(t *testing.T) { ElementType: Int32, } p.Content = []byte{1, 3} - config, err := ioutil.ReadFile("./cclient/test/testdata/optimizer.pb") + config, err := ioutil.ReadFile("./client/c/test/testdata/optimizer.pb") if err != nil { t.Fatalf("read optimizer proto failed") } diff --git a/go/pserver/service.go b/go/pserver/service.go index e15a4e5a58..7711dc027e 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -24,9 +24,6 @@ const ( Float64 ) -// PsDesired is etcd path for store desired pserver count -const PsDesired = "/ps_desired" - // Parameter is a piece of data to sync with the parameter 
server. type Parameter struct { Name string diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go index 30e3ac8ae1..b6d20d2c8b 100644 --- a/go/pserver/service_test.go +++ b/go/pserver/service_test.go @@ -10,6 +10,10 @@ import ( "github.com/PaddlePaddle/Paddle/go/pserver" ) +const ( + OptimizerConfig = "./client/c/test/testdata/optimizer.pb" +) + func TestServiceFull(t *testing.T) { s, err := pserver.NewService(0) if err != nil { @@ -19,7 +23,7 @@ func TestServiceFull(t *testing.T) { p.Name = "param_a" p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0} p.ElementType = pserver.Int32 - config, err := ioutil.ReadFile("./cclient/test/testdata/optimizer.pb") + config, err := ioutil.ReadFile(OptimizerConfig) if err != nil { t.Fatalf("read optimizer proto failed") } @@ -149,7 +153,7 @@ func TestBlockUntilInitialized(t *testing.T) { p.Name = "param_a" p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0} p.ElementType = pserver.Int32 - config, err := ioutil.ReadFile("./cclient/test/testdata/optimizer.pb") + config, err := ioutil.ReadFile(OptimizerConfig) if err != nil { t.Fatalf("read optimizer proto failed") } From 3f5e5a24c497714530e8f55f2f076fc4e3168d9c Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 4 Jul 2017 08:16:08 +0000 Subject: [PATCH 43/79] fix cmake error --- .travis.yml | 2 +- go/master/c/CMakeLists.txt | 2 +- go/pserver/optimizer.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index a53bd18094..4f72e2ca33 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,7 +42,7 @@ before_install: function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: - | - timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout + timeout 2580 paddle/scripts/travis/${JOB}.sh -e "WITH_GOLANG=ON" # 43min timeout RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; notifications: email: diff --git a/go/master/c/CMakeLists.txt b/go/master/c/CMakeLists.txt index 94d6bb0b2e..d900850be0 100644 --- a/go/master/c/CMakeLists.txt +++ b/go/master/c/CMakeLists.txt @@ -1 +1 @@ -go_library(paddle_master SHARED) +go_library(paddle_master SHARED DEPS paddle_go_optimizer) diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index bca3718af3..d84f55b987 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -2,7 +2,7 @@ package pserver // #cgo CFLAGS: -I ../../ // //FIXME: ldflags contain "build" path -// #cgo LDFLAGS: ../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ +// #cgo LDFLAGS: ../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm // #include "paddle/optimizer/optimizer.h" // #include // #include From d8941e67ec5da7333666b31264704dae7d830ca2 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 4 Jul 2017 08:24:28 +0000 Subject: [PATCH 44/79] fix bugs --- .travis.yml | 2 +- paddle/scripts/docker/build.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4f72e2ca33..16432dac0c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,7 +42,7 @@ before_install: function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: - | - timeout 2580 paddle/scripts/travis/${JOB}.sh -e "WITH_GOLANG=ON" # 43min timeout + export WITH_GOLANG=ON && timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; notifications: email: diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index a182e5f4ae..1ccee686df 
100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -3,7 +3,7 @@ set -xe # Set BASE_IMAGE according to env variables -if [ ${WITH_GPU} == "ON" ]; then +if [[ ${WITH_GPU} == "ON" ]]; then BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04" else BASE_IMAGE="ubuntu:16.04" From 86543f7f6a8f0fc073977794abee9ae5b033f78e Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 4 Jul 2017 16:40:00 +0800 Subject: [PATCH 45/79] Follow comments. --- doc/api/v2/config/layer.rst | 2 +- paddle/gserver/layers/DetectionOutputLayer.h | 8 +- paddle/gserver/layers/MultiBoxLossLayer.cpp | 6 +- paddle/gserver/layers/MultiBoxLossLayer.h | 2 +- .../paddle/trainer_config_helpers/layers.py | 20 +++-- .../test_detection_output_layer.protostr | 66 ++++++++++++++++ .../test_multibox_loss_layer.protostr | 79 +++++++++++++++++++ 7 files changed, 164 insertions(+), 19 deletions(-) create mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr create mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index 0a8465919d..4f4a9187bc 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -478,6 +478,6 @@ Detection output Layer ====================== detection_output ---- +---------------- .. autoclass:: paddle.v2.layer.detection_output :noindex: diff --git a/paddle/gserver/layers/DetectionOutputLayer.h b/paddle/gserver/layers/DetectionOutputLayer.h index 9cc568219c..a232af0a69 100644 --- a/paddle/gserver/layers/DetectionOutputLayer.h +++ b/paddle/gserver/layers/DetectionOutputLayer.h @@ -22,14 +22,14 @@ limitations under the License. */ namespace paddle { /** - * The detection output layer for a SSD detection task. This layer apply the - * Non-maximum suppression to the all predicted bounding box and keep the + * The detection output layer for a SSD detection task. This layer applies the + * Non-maximum suppression to the all predicted bounding box and keeps the * Top-K bounding boxes. - * - Input: This layer needs three input layers: This first input layer + * - Input: This layer needs three input layers: The first input layer * is the priorbox layer. The rest two input layers are convolution * layers for generating bbox location offset and the classification * confidence. - * - Output: The predict bounding box location. + * - Output: The predict bounding box locations. 
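+ *   Each output row presumably has the layout
+ *   (image_id, label, confidence, xmin, ymin, xmax, ymax); the output
+ *   size of keep_top_k * 7 in the test config below is consistent with
+ *   this assumption.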
*/ class DetectionOutputLayer : public Layer { diff --git a/paddle/gserver/layers/MultiBoxLossLayer.cpp b/paddle/gserver/layers/MultiBoxLossLayer.cpp index f2d7b8eb1d..bbf1166dce 100644 --- a/paddle/gserver/layers/MultiBoxLossLayer.cpp +++ b/paddle/gserver/layers/MultiBoxLossLayer.cpp @@ -258,8 +258,7 @@ void MultiBoxLossLayer::forward(PassType passType) { } real loss = locLoss_ + confLoss_; MatrixPtr outV = getOutputValue(); - std::vector tmp(batchSize, loss); - outV->copyFrom(&tmp[0], batchSize); + outV->assign(loss); } void MultiBoxLossLayer::backward(const UpdateCallback& callback) { @@ -336,6 +335,9 @@ void MultiBoxLossLayer::backward(const UpdateCallback& callback) { const MatrixPtr inLocG = getInputGrad(*getLocInputLayer(n)); const MatrixPtr inConfG = getInputGrad(*getConfInputLayer(n)); size_t height = getInput(*getLocInputLayer(n)).getFrameHeight(); + // only for unittest, there are no width and height information + // when constructing matrix in unittest, so we should + // set the shape in configuration if (!height) height = layerConf.height(); size_t width = getInput(*getLocInputLayer(n)).getFrameWidth(); if (!width) width = layerConf.width(); diff --git a/paddle/gserver/layers/MultiBoxLossLayer.h b/paddle/gserver/layers/MultiBoxLossLayer.h index 9767fed7f1..9935da5644 100644 --- a/paddle/gserver/layers/MultiBoxLossLayer.h +++ b/paddle/gserver/layers/MultiBoxLossLayer.h @@ -30,7 +30,7 @@ namespace paddle { * The loss is composed by the location loss and the confidence loss. * The location loss is a smooth L1 loss and the confidence loss is * a softmax loss. - * - Input: This layer need four input layers: This first input layer + * - Input: This layer needs four input layers: The first input layer * is the priorbox layer and the second layer is a label layer. * The rest two input layers are convolution layers for generating * bbox location offset and the classification confidence. diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 1286ed198e..86e91e2c57 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -1072,10 +1072,10 @@ def multibox_loss_layer(input_loc, :param name: The Layer Name. :type name: basestring - :param input_loc: The input predict location. - :type input_loc: LayerOutput + :param input_loc: The input predict locations. + :type input_loc: LayerOutput | List of LayerOutput :param input_conf: The input priorbox confidence. - :type input_conf: LayerOutput + :type input_conf: LayerOutput | List of LayerOutput :param priorbox: The input priorbox location and the variance. :type priorbox: LayerOutput :param label: The input label. @@ -1146,10 +1146,10 @@ def detection_output_layer(input_loc, :param name: The Layer Name. :type name: basestring - :param input_loc: The input predict location. - :type input_loc: LayerOutput + :param input_loc: The input predict locations. + :type input_loc: LayerOutput | List of LayerOutput. :param input_conf: The input priorbox confidence. - :type input_conf: LayerOutput + :type input_conf: LayerOutput | List of LayerOutput. :param priorbox: The input priorbox location and the variance. :type priorbox: LayerOutput :param num_classes: The number of the classification. 
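The full call that these docstring parameters describe can be sketched from the
test configuration later in this series (a hypothetical usage example; shapes
and thresholds are copied from `test_detection_output_layer.protostr`):

```python
# Sketch of a config using detection_output_layer; the data-layer shapes
# and threshold values mirror the protostr test config shown below.
input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
priorbox = data_layer(name='priorbox', size=32, height=4, width=8)

out = detection_output_layer(
    input_loc=input_loc,
    input_conf=input_conf,
    priorbox=priorbox,
    num_classes=21,
    nms_threshold=0.45,
    nms_top_k=400,
    keep_top_k=200,
    confidence_threshold=0.01,
    background_id=0,
    name='test_detection_output')
```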
@@ -1166,22 +1166,20 @@ def detection_output_layer(input_loc, :type background_id: int :return: LayerOutput """ - input_loc_num = 0 - input_conf_num = 0 - if isinstance(input_loc, LayerOutput): input_loc = [input_loc] assert isinstance(input_loc, collections.Sequence) # list or tuple for each in input_loc: assert isinstance(each, LayerOutput) - input_loc_num += 1 + input_loc_num = len(input_loc) if isinstance(input_conf, LayerOutput): input_conf = [input_conf] assert isinstance(input_conf, collections.Sequence) # list or tuple for each in input_conf: assert isinstance(each, LayerOutput) - input_conf_num += 1 + input_conf_num = len(input_conf) + # Check the input layer number. assert input_loc_num == input_conf_num diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr new file mode 100644 index 0000000000..6690f9852a --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr @@ -0,0 +1,66 @@ +type: "nn" +layers { + name: "input_loc" + type: "data" + size: 16 + active_type: "" + height: 16 + width: 1 +} +layers { + name: "input_conf" + type: "data" + size: 8 + active_type: "" + height: 1 + width: 8 +} +layers { + name: "priorbox" + type: "data" + size: 32 + active_type: "" + height: 4 + width: 8 +} +layers { + name: "test_detection_output" + type: "detection_output" + size: 1400 + active_type: "" + inputs { + input_layer_name: "priorbox" + detection_output_conf { + num_classes: 21 + nms_threshold: 0.45 + nms_top_k: 400 + background_id: 0 + input_num: 1 + keep_top_k: 200 + confidence_threshold: 0.01 + } + } + inputs { + input_layer_name: "input_loc" + } + inputs { + input_layer_name: "input_conf" + } +} +input_layer_names: "priorbox" +input_layer_names: "input_loc" +input_layer_names: "input_conf" +output_layer_names: "test_detection_output" +sub_models { + name: "root" + layer_names: "input_loc" + layer_names: "input_conf" + layer_names: "priorbox" + layer_names: "test_detection_output" + input_layer_names: "priorbox" + input_layer_names: "input_loc" + input_layer_names: "input_conf" + output_layer_names: "test_detection_output" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr new file mode 100644 index 0000000000..0ba84dcc6d --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr @@ -0,0 +1,79 @@ +type: "nn" +layers { + name: "input_loc" + type: "data" + size: 16 + active_type: "" + height: 16 + width: 1 +} +layers { + name: "input_conf" + type: "data" + size: 8 + active_type: "" + height: 1 + width: 8 +} +layers { + name: "priorbox" + type: "data" + size: 32 + active_type: "" + height: 4 + width: 8 +} +layers { + name: "label" + type: "data" + size: 24 + active_type: "" + height: 4 + width: 6 +} +layers { + name: "test_multibox_loss" + type: "multibox_loss" + size: 1 + active_type: "" + inputs { + input_layer_name: "priorbox" + multibox_loss_conf { + num_classes: 21 + overlap_threshold: 0.5 + neg_pos_ratio: 3.0 + neg_overlap: 0.5 + background_id: 0 + input_num: 1 + } + } + inputs { + input_layer_name: "label" + } + inputs { + input_layer_name: "input_loc" + } + inputs { + input_layer_name: "input_conf" + } +} +input_layer_names: 
"priorbox" +input_layer_names: "label" +input_layer_names: "input_loc" +input_layer_names: "input_conf" +output_layer_names: "test_multibox_loss" +sub_models { + name: "root" + layer_names: "input_loc" + layer_names: "input_conf" + layer_names: "priorbox" + layer_names: "label" + layer_names: "test_multibox_loss" + input_layer_names: "priorbox" + input_layer_names: "label" + input_layer_names: "input_loc" + input_layer_names: "input_conf" + output_layer_names: "test_multibox_loss" + is_recurrent_layer_group: false +} + From e6fcdd47e5c153497efb54ff5737f4fc6a13596d Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 4 Jul 2017 17:22:43 +0800 Subject: [PATCH 46/79] fix wrong including header-file in files in paddle/platform/dynload dir --- paddle/platform/dynload/cublas.h | 4 ++- paddle/platform/dynload/cudnn.h | 4 ++- paddle/platform/dynload/curand.h | 4 ++- paddle/platform/dynload/dynamic_loader.cc | 41 ++++++++++------------- 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h index c9150ac573..258cc88031 100644 --- a/paddle/platform/dynload/cublas.h +++ b/paddle/platform/dynload/cublas.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include -#include "paddle/platform/dynamic_loader.h" +#include +#include +#include "paddle/platform/dynload/dynamic_loader.h" namespace paddle { namespace platform { diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h index c03424b375..0a9562c573 100644 --- a/paddle/platform/dynload/cudnn.h +++ b/paddle/platform/dynload/cudnn.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include -#include "paddle/platform/dynamic_loader.h" +#include +#include +#include "paddle/platform/dynload/dynamic_loader.h" namespace paddle { namespace platform { diff --git a/paddle/platform/dynload/curand.h b/paddle/platform/dynload/curand.h index 1ef7a8c833..9dc0a25c0f 100644 --- a/paddle/platform/dynload/curand.h +++ b/paddle/platform/dynload/curand.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include -#include "paddle/platform/dynamic_loader.h" +#include +#include +#include "paddle/platform/dynload/dynamic_loader.h" namespace paddle { namespace platform { diff --git a/paddle/platform/dynload/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc index 8ef67bad8c..dd914e006d 100644 --- a/paddle/platform/dynload/dynamic_loader.cc +++ b/paddle/platform/dynload/dynamic_loader.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "dynamic_loader.h" +#include "paddle/platform/dynload/dynamic_loader.h" #include #include #include #include #include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/framework/enforce.h" DEFINE_string(cudnn_dir, "", "Specify path for loading libcudnn.so. 
For instance, "
@@ -72,13 +73,12 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
   *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
   if (nullptr == *dso_handle) {
     if (dso_path == "libcudnn.dylib") {
-      LOG(FATAL)
-          << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"  // NOLINT
-          << "For instance, sudo tar -xzf "
-             "cudnn-7.5-osx-x64-v5.0-ga.tgz -C "  // NOLINT
-          << "/usr/local \n sudo chmod a+r "
-             "/usr/local/cuda/include/cudnn.h "  // NOLINT
-          << "/usr/local/cuda/lib/libcudnn*";
+      PADDLE_ENFORCE(false,
+                     "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
+                     "For instance, sudo tar -xzf "
+                     "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
+                     "chmod a+r /usr/local/cuda/include/cudnn.h "
+                     "/usr/local/cuda/lib/libcudnn*");
     }
   }
 }
@@ -106,22 +106,15 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
       GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
     }
   }
-
-  CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath
-                                << " (" << dlerror() << ") \n"
-                                << "Please specify its path correctly using "
-                                   "following ways: \n"
-
-                                << "Method. set environment variable "
-                                   "LD_LIBRARY_PATH on Linux or "
-                                << "DYLD_LIBRARY_PATH on Mac OS. \n"
-                                << "For instance, issue command: export "
-                                   "LD_LIBRARY_PATH=... \n"
-
-                                << "Note: After Mac OS 10.11, using the "
-                                   "DYLD_LIBRARY_PATH is impossible "
-                                << "unless System Integrity Protection (SIP) "
-                                   "is disabled.";
+  PADDLE_ENFORCE(nullptr != *dso_handle,
+                 "Failed to find dynamic library: %s ( %s ) \n Please specify "
+                 "its path correctly using following ways: \n Method. set "
+                 "environment variable LD_LIBRARY_PATH on Linux or "
+                 "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
+                 "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
+                 "using the DYLD_LIBRARY_PATH is impossible unless System "
+                 "Integrity Protection (SIP) is disabled.",
+                 dlPath, dlerror());
 }

 void GetCublasDsoHandle(void** dso_handle) {
From 571714159aeb42903fca14d614dcb1e6942b5cc4 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Tue, 4 Jul 2017 20:04:32 +0800
Subject: [PATCH 47/79] add op_desc.proto (#2736)

* add op_desc.proto

In Operator design, we need a proto message to describe an Operator.
Third-party languages such as Python can build this proto message and use
AddOp(const OpDesc& op_desc) of Paddle core to construct an Op in the
Network.
---
 paddle/framework/CMakeLists.txt  |  3 ++
 paddle/framework/op_desc.proto   | 56 ++++++++++++++++++++++++++++++++
 paddle/framework/op_desc_test.cc | 35 ++++++++++++++++++++
 3 files changed, 94 insertions(+)
 create mode 100644 paddle/framework/op_desc.proto
 create mode 100644 paddle/framework/op_desc_test.cc

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index baad38e3c1..a016f57b3e 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -8,3 +8,6 @@ cc_test(enforce_test SRCS enforce_test.cc)
 proto_library(attr_type SRCS attr_type.proto)
 proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
+
+proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
+cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
diff --git a/paddle/framework/op_desc.proto b/paddle/framework/op_desc.proto
new file mode 100644
index 0000000000..89497f3c16
--- /dev/null
+++ b/paddle/framework/op_desc.proto
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax="proto2";
+package paddle.framework;
+
+import "attr_type.proto";
+
+// AttrDesc is used to describe Attributes of an Operator. It contains
+// the name, type, and value of an Attribute.
+//
+// e.g., for scale=3.0: name=scale, type=AttrType.FLOAT, value=3.0
+message AttrDesc {
+  required string name = 1;
+  required AttrType type = 2;
+  optional int32 i = 3;
+  optional float f = 4;
+  optional string s = 5;
+  repeated int32 ints = 6;
+  repeated float floats = 7;
+  repeated string strings = 8;
+};
+
+// Protocol Message to describe an Operator.
+//
+// In PaddlePaddle, Operator is used to do a certain computation such
+// as "add", "sub", "cosine", etc.
+// (1) Operator needs to know the input and output variable names.
+// (2) Some ops may have special attributes such as "scale" in "CosineOp".
+//
+// A third-party language can build this proto message and call
+// AddOp(const OpDesc& op_desc) of Paddle core to create an Operator.
+message OpDesc {
+  // input names of this Operator.
+  repeated string inputs = 1;
+
+  // output names of this Operator.
+  repeated string outputs = 2;
+
+  // type of this Operator, such as "add", "sub", "fc".
+  required string type = 3;
+
+  // Attributes of this Operator. e.g., scale=3.0 in cosine op.
+  repeated AttrDesc attrs = 4;
+};
\ No newline at end of file
diff --git a/paddle/framework/op_desc_test.cc b/paddle/framework/op_desc_test.cc
new file mode 100644
index 0000000000..d0c52523b6
--- /dev/null
+++ b/paddle/framework/op_desc_test.cc
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/framework/op_desc.pb.h>
+
+TEST(OpDesc, Create) {
+  paddle::framework::OpDesc op_desc;
+  op_desc.set_type("add");
+  op_desc.add_inputs("X");
+  op_desc.add_inputs("Y");
+  op_desc.add_outputs("Z");
+
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_f(3.14);
+
+  // required field name is not set, so IsInitialized should be false.
+  ASSERT_FALSE(op_desc.IsInitialized());
+
+  attr->set_name("add");
+  // after all required fields are set, IsInitialized should be true now.
+  ASSERT_TRUE(op_desc.IsInitialized());
+}
\ No newline at end of file
From 1ecddd8174fea793e70071163b7e47a750064499 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 4 Jul 2017 21:21:02 +0800
Subject: [PATCH 48/79] Remove buggy BarrierStat

The implementation of BarrierStat is buggy, and it is not necessary for
Paddle to diagnose which node in the cluster is slow.
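The op_desc patch above anticipates exactly this flow: Python builds the message and hands it to AddOp. A minimal sketch of the Python side, mirroring op_desc_test.cc — the module names op_desc_pb2 and attr_type_pb2 are assumptions about what protoc would generate for these .proto files.

# Hypothetical sketch: build an OpDesc from Python, mirroring op_desc_test.cc.
# Module names are assumptions; use whatever protoc emits for
# op_desc.proto and attr_type.proto.
import attr_type_pb2
import op_desc_pb2

op_desc = op_desc_pb2.OpDesc()
op_desc.type = "add"
op_desc.inputs.extend(["X", "Y"])
op_desc.outputs.append("Z")

attr = op_desc.attrs.add()
attr.type = attr_type_pb2.FLOAT  # AttrType enum value from attr_type.proto
attr.f = 3.14

# 'name' is a required proto2 field, so the message is incomplete until set.
assert not op_desc.IsInitialized()
attr.name = "scale"
assert op_desc.IsInitialized()

# The serialized bytes are what AddOp(const OpDesc&) would consume.
payload = op_desc.SerializeToString()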
--- paddle/parameter/tests/test_common.cpp | 50 --- paddle/pserver/ParameterServer2.cpp | 215 ------------- paddle/pserver/ParameterServer2.h | 49 --- paddle/utils/BarrierStat.cpp | 340 -------------------- paddle/utils/BarrierStat.h | 425 ------------------------- paddle/utils/Stat.cpp | 61 ---- paddle/utils/Stat.h | 17 - 7 files changed, 1157 deletions(-) delete mode 100644 paddle/utils/BarrierStat.cpp delete mode 100644 paddle/utils/BarrierStat.h diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp index 8bab5a6289..64d204aea1 100644 --- a/paddle/parameter/tests/test_common.cpp +++ b/paddle/parameter/tests/test_common.cpp @@ -172,53 +172,3 @@ TEST_F(CommonTest, syncThreadPool) { EXPECT_EQ((int)0, nums[i]); } } - -TEST_F(CommonTest, barrierStat) { - const int threadNum = 10; - - SyncThreadPool pool(threadNum); - -#define TEST_BARRIER_RANDOM(statName, numConnThreads, ...) \ - pool.exec([&](int tid, size_t numThreads) { \ - struct timeval time; \ - gettimeofday(&time, nullptr); \ - uint64_t usec = timeToMicroSecond(time); \ - std::srand(usec); \ - auto value = std::rand() % 100000; \ - usleep(value); \ - REGISTER_SLOW_NODES_PROBE( \ - globalStat, statName, numConnThreads, tid, __VA_ARGS__); \ - }); - - for (auto i = 0; i < 10; i++) { - TEST_BARRIER_RANDOM("synThreadBarrier1", threadNum); - TEST_BARRIER_RANDOM("synThreadBarrier2", threadNum); - } - - globalStat.printAllStatus(); - globalStat.reset(); - - for (auto i = 0; i < 10; i++) { - TEST_BARRIER_RANDOM("synThreadBarrier3", threadNum, "tag0"); - TEST_BARRIER_RANDOM("synThreadBarrier4", threadNum, "tag1"); - } - - globalStat.printAllStatus(); - globalStat.reset(); - -// use it to test accurate barrier gap -#define TEST_BARRIER(statName, numConnThreads, ...) \ - pool.exec([&](int tid, size_t numThreads) { \ - usleep(tid * 10000); \ - REGISTER_SLOW_NODES_PROBE( \ - globalStat, statName, numConnThreads, tid, __VA_ARGS__); \ - }); - - for (auto i = 0; i < 10; i++) { - TEST_BARRIER("synThreadBarrier3", threadNum, "tag0"); - TEST_BARRIER("synThreadBarrier4", threadNum, "tag1"); - } - - globalStat.printAllStatus(); - globalStat.reset(); -} diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp index 41ac15336d..d7c1d4f788 100644 --- a/paddle/pserver/ParameterServer2.cpp +++ b/paddle/pserver/ParameterServer2.cpp @@ -217,10 +217,6 @@ void ParameterServer2::setConfig(const SetConfigRequest& request, SetConfigResponse response; callback(response); - - /// always defined, barrier slowest node function need it. - statSet_.reset(new StatSet("ParameterServer" + - str::to_string(static_cast(serverId_)))); } real bufferSum(const std::vector& buffers) { @@ -369,50 +365,7 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, std::vector* outputBuffers) { VLOG(1) << "pserver: addGradient"; - // forwardbackward delta from all trainers - // indicate the fluctuation caused by forwardbackward. - if (!numPassFinishClients_) { - REGISTER_BARRIER_DELTA_SERVER_SET( - *statSet_, - "forwardbackwardDelta", - FLAGS_num_gradient_servers, - request.trainer_id(), - request.forwardbackward_time(), - isSparseServer_ ? 
"_sparseUpdater" : "_denseUpdater"); - } - { - /// approximately pure network overhead - REGISTER_TIMER_DYNAMIC_SET( - "pushRecv", timeToMicroSecond(*handleRequestBegin_), -1, *statSet_); - } - -#ifndef PADDLE_DISABLE_TIMER - gettimeofday(&(*addGradBegin_), nullptr); -#endif - - /// barrier fluctuation caused by network and previous forwardbackward - if (!numPassFinishClients_) { - REGISTER_BARRIER_TIMER_SERVER_SET( - *statSet_, - "handleReqBegin", - FLAGS_num_gradient_servers, - request.trainer_id(), - (*handleRequestBegin_), - isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); - } - - if (!numPassFinishClients_) { - REGISTER_BARRIER_TIMER_SERVER( - *statSet_, - "addGradBegin", - FLAGS_num_gradient_servers, - request.trainer_id(), - isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); - } - - { - REGISTER_TIMER_DYNAMIC("addGradCore", -1, *statSet_); ReadLockGuard guard(parameterMutex_); int bufferIndex = 0; for (const auto& block : request.blocks()) { @@ -444,15 +397,6 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, std::lock_guard guard(*info.lock); simd::addTo(gradientSumBuffer, gradientBuffer, size); } - - if (!numPassFinishClients_) { - REGISTER_BARRIER_TIMER_SERVER( - *statSet_, - "addGradCoreFinish", - FLAGS_num_gradient_servers, - request.trainer_id(), - isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); - } } if (request.batch_status() == BATCH_FINISH || request.batch_status() == BATCH_START_AND_FINISH) { @@ -461,47 +405,12 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, VLOG(1) << "num samples: " << numSamplesProcessed_ << ", new cost:" << cost_; - /// numPassFinishClients_ means some trainer has entered finishPass - if (!numPassFinishClients_) { - REGISTER_SLOW_NODES_PROBE( - *statSet_, - "SLOW_NODES", - FLAGS_num_gradient_servers, - request.trainer_id(), - isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); - } - /// notify doOperation gradient ready gradientReadyBarrier_.wait(); - /// if wait pass finish does not start, do check - if (!numPassFinishClients_) { - CHECK_BARRIER_TIMER(*statSet_, - "SLOW_NODES", - FLAGS_num_gradient_servers, - isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); - } - - /// barrier performance while all parameter add is finished - /// can indicate the fluctation caused by computation at pserver. - if (!numPassFinishClients_) { - REGISTER_BARRIER_TIMER_SERVER( - *statSet_, - "paraReady", - FLAGS_num_gradient_servers, - request.trainer_id(), - isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); - } /// wait doOperation finish parameterReadyBarrier_.wait(); VLOG(1) << "start send back"; - { - /// total time except overhead of network. 
- REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecvNoSend", - timeToMicroSecond(*addGradBegin_), - -1, - *statSet_); - } } } @@ -543,57 +452,6 @@ bool ParameterServer2::asyncGrdientCommitCheckAndStat( return commitGradient; } -void ParameterServer2::printAsyncGradientCommitStatAndReset() { - std::stringstream statFormat; - if (asyncUpdateSteps_) { - statFormat << "async discard gradients stat: " << std::endl; - statFormat << "serverId: " << serverId_ - << " serverType: " << isSparseServer_ - << " total updates: " << asyncUpdateSteps_ - << " discard updates: " << asyncLaggedGradientsNum_ - << " discard ratio: " - << (real)asyncLaggedGradientsNum_ / (real)asyncUpdateSteps_; - statFormat << std::endl; - statFormat << std::endl; - - statFormat << "Async Gradient Update Steps distribution: " << std::endl - << "Sample: 1:1912(0.00284449) means " - << "the updates step=1 count 1912 times " - << "and account for 0.284449% of total updates" << std::endl; - size_t index = 0; - for (const auto& stat : asyncUpdateStat_) { - statFormat << index << ":" << stat << "(" - << (real)stat / (real)asyncUpdateSteps_ << ") "; - index++; - } - statFormat << std::endl; - statFormat << std::endl; - - statFormat << "Async Gradient Discard based on trainer_id: " << std::endl - << "Sample: 2:22(0.0016363) means " - << "total discarded updates from trainer_id=2 count 22 " - << "and account for 0.16363% of all updates from trainer_id=2" - << std::endl; - for (auto i = 0; i < FLAGS_num_gradient_servers; i++) { - real ratio = - (real)asyncTrainerDiscardStat_[i] / - (real)(asyncTrainerCommitStat_[i] + asyncTrainerDiscardStat_[i]); - statFormat << i << ":" << asyncTrainerDiscardStat_[i] << "(" << ratio - << ")" - << " "; - } - LOG(INFO) << statFormat.str(); - - /// reset stat - asyncUpdateSteps_ = 0; - asyncTrainerSteps_.assign(asyncTrainerSteps_.size(), 0); - asyncLaggedGradientsNum_ = 0; - asyncUpdateStat_.assign(asyncUpdateStat_.size(), 0); - asyncTrainerDiscardStat_.assign(asyncTrainerDiscardStat_.size(), 0); - asyncTrainerCommitStat_.assign(asyncTrainerCommitStat_.size(), 0); - } -} - static ThreadLocal> localBlockBitset_; void ParameterServer2::asyncSGD(const SendParameterRequest& request, @@ -695,7 +553,6 @@ void ParameterServer2::asyncSGD(const SendParameterRequest& request, if (request.trainer_id() == 0) { /// batchId_ is approximately equal to "real batchId_" batchId_++; - tuningAsyncsgdMidOutput(); } } @@ -881,34 +738,6 @@ void ParameterServer2::sendParameter(const SendParameterRequest& request, } (*requestVec_).clear(); (*callbackVec_).clear(); - - /// barrier perfromance while all data are send finished. - /// indicates network flucatuation for big message. - if (!numPassFinishClients_) { - REGISTER_BARRIER_TIMER_SERVER( - *statSet_, - "sendParamFinish", - FLAGS_num_gradient_servers, - request.trainer_id(), - isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); - } - /// all time exhausted in parameterServer for big message. - /// it contains network and computation at pserver. - { - /// total time including overhead of network. - REGISTER_TIMER_DYNAMIC_SET("sendParaTotal", - timeToMicroSecond(*handleRequestBegin_), - -1, - *statSet_); - } - /// all time exhausted in pserverServer except recieve network. 
- { - /// total time except overhead of network receive - REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecv", - timeToMicroSecond(*addGradBegin_), - -1, - *statSet_); - } } break; case PSERVER_UPDATE_MODE_SET_PARAM: @@ -1088,8 +917,6 @@ void ParameterServer2::op_SGD(const Operation& operation, } { - REGISTER_TIMER_DYNAMIC("op_SGD", -1, *statSet_); - parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { BlockInfo& info = blockInfos_[blockId]; const ParameterConfig& config = getParameterConfig(blockId); @@ -1113,7 +940,6 @@ void ParameterServer2::op_SGD(const Operation& operation, } batchId_++; - tuningSgdMidOutput(); } void ParameterServer2::op_start_pass(const Operation& operation, @@ -1146,8 +972,6 @@ void ParameterServer2::op_finish_pass(const Operation& operation, /// finish pass info.optimizer->finishPass(); }); - - tuningSgdFinished(); batchId_ = 0; } @@ -1515,7 +1339,6 @@ void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request, callback(SynchronizeResponse()); if (request.trainer_id() == 0) { - tuningAsyncsgdFinished(); batchId_ = 0; } } @@ -1574,42 +1397,4 @@ void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request, callback(response); } -void ParameterServer2::tuningSgdMidOutput() { - if (batchId_ && batchId_ % FLAGS_log_period_server == 0) { - LOG(INFO) << "======== Batch=" << batchId_ << "======="; - statSet_->setThreadInfo(true); - statSet_->printAllStatus(); - /// not reset raw data for reducing the overhead of performance tuning - statSet_->reset(false); - } -} - -void ParameterServer2::tuningSgdFinished() { - LOG(INFO) << "======== Batch=" << batchId_ << " pass END" - << "======="; - statSet_->setThreadInfo(true); - statSet_->printAllStatus(); - /** - * reset raw data at end of pass since some raw data could be not - * complete. Otherwise the raw data will pollute next pass performance - * tuning - */ - statSet_->reset(); -} - -void ParameterServer2::tuningAsyncsgdMidOutput() { -#ifndef PADDLE_DISABLE_TIMER - if (batchId_ && batchId_ % FLAGS_log_period_server == 0) { - LOG(INFO) << "======== [not accurate] Batch=" << batchId_ << "======="; - printAsyncGradientCommitStatAndReset(); - } -#endif -} - -void ParameterServer2::tuningAsyncsgdFinished() { - LOG(INFO) << "======== [not accurate] Batch=" << batchId_ << " pass END" - << "======="; - printAsyncGradientCommitStatAndReset(); -} - } // namespace paddle diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h index 0f5a589590..f7d3587b88 100644 --- a/paddle/pserver/ParameterServer2.h +++ b/paddle/pserver/ParameterServer2.h @@ -298,24 +298,6 @@ protected: /// barrier performance tuning sync-sgd required std::atomic batchId_; - /// the beginning of addGradient without network overhead - ThreadLocal addGradBegin_; - - /** - * tuning barrier performance - * to better control log for sparse and dense parameter, - * we use different log entities for different parameterServer - * objects. - * it will output lots of performance stats to perceive the - * overhead of network, fluctuation of computation from - * forwardbackward and network, computation from optimization - * at pserver end, barrier overhead, etc. 
to understand tuning - * data, focus on the synchronization between addGradient and - * doOperation which indirectly call op_SGD operation controlled - * by remote updater controller - */ - std::unique_ptr statSet_; - public: struct Buffer { real* base; @@ -325,7 +307,6 @@ public: protected: /// async gradient commit control bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request); - void printAsyncGradientCommitStatAndReset(); public: /// disable default parameter for overloading @@ -710,36 +691,6 @@ public: void op_load(const Operation& operation, OperationResult* result); void op_save(const Operation& operation, OperationResult* result); - - /** - * @brief output log in at the middle stage of training - * - * @note flush log histroy and state at the end for sgd - */ - void tuningSgdMidOutput(); - - /** - * @brief output log in at the end stage of training - * - * @note flush log histroy and state at the end for sgd. it will also - * flush some stateful stat for next pass. - */ - void tuningSgdFinished(); - - /** - * @brief output log in at the middle stage of training - * - * @note flush log histroy and state at the end for async-sgd. - * it will log some performance log if some lagged node are found - */ - void tuningAsyncsgdMidOutput(); - - /** - * @brief output log in at the end stage of training - * - * @note flush log histroy and state at the end for async-sgd. - */ - void tuningAsyncsgdFinished(); }; } // namespace paddle diff --git a/paddle/utils/BarrierStat.cpp b/paddle/utils/BarrierStat.cpp deleted file mode 100644 index a6dbdcae3f..0000000000 --- a/paddle/utils/BarrierStat.cpp +++ /dev/null @@ -1,340 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/utils/BarrierStat.h" -#include -#include -#include -#include -#include "paddle/utils/Flags.h" -#include "paddle/utils/Stat.h" - -DEFINE_bool(log_barrier_abstract, - true, - "if true, show abstract of barrier performance"); -DEFINE_int32(log_barrier_lowest_nodes, - 5, - "how many lowest node will be logged"); -DEFINE_bool(log_barrier_show_log, - false, // for performance tuning insight - "if true, always show barrier abstract even with little gap"); - -namespace paddle { - -std::ostream &operator<<(std::ostream &output, const BarrierStatBase &stat) { - if (FLAGS_log_barrier_abstract) { - std::lock_guard guard(stat.lock_); - stat.showAbstract(output); - } - return output; -} - -BarrierStatBase::BarrierStatBase(uint16_t numConnThreads, - const std::string &name) - : totSamples_(0), numConnThreads_(numConnThreads), name_(name) { - abstract_.resize(numConnThreads_); - if (FLAGS_log_barrier_show_log) { - rateThreshold_ = 0.0; - } else { - /* probablity of abnormal node - * p = 1/n + (n/8)/(n+1), n = nodes, n > 1 - * if the freq of lowest trainerId larger than p, - * output FLAGS_log_barrier_lowest_nodes lastTrainerId. 
- * numConnThreads_ indicates nodes - */ - float n = (float)numConnThreads; - rateThreshold_ = 1.0 / n + (n / 8.0) / (n + 1.0); - } -} - -BarrierEndStat::BarrierEndStat(uint16_t numConnThreads, const std::string &name) - : BarrierStatBase(numConnThreads, name) { - timeVector_.reset(new TimeVectorEnd(numConnThreads_)); - reset(true); - LOG(INFO) << " create barrierEndStat: " << name - << " endBarrier warning rate: " << rateThreshold_; -} - -/* - * Note: - * the design different pserver entity owns different statSet to obey - * the background that different pserver runs separately. - */ -void BarrierEndStat::updateStat(struct timeval &cur, int32_t trainerId) { - CHECK_LT(trainerId, numConnThreads_) << "trainerId is invalid in barrier"; - - std::lock_guard guard(lock_); - timeVector_->addTimeval(cur, trainerId); - - if (timeVector_->full()) { - std::lock_guard abstractGuard(abstractLock_); - auto id = timeVector_->getLastTrainerId(); - auto delta = timeToMicroSecond(timeVector_->getDelta()); - auto secondDelta = timeToMicroSecond(timeVector_->get1NDelta()); - auto lastTwoDelta = timeToMicroSecond(timeVector_->getMinus1NDelta()); - auto midDelta = timeToMicroSecond(timeVector_->getMidNDelta()); - // discard first sample, since first sample probably is abnormal. - if (totSamples_) { - abstract_[id].freq++; - - if (delta < abstract_[id].minDelta) { - abstract_[id].minDelta = delta; - } - if (delta > abstract_[id].maxDelta) { - abstract_[id].maxDelta = delta; - } - abstract_[id].totDelta += delta; - abstract_[id].totSecondDelta += secondDelta; - abstract_[id].totLastTwoDelta += lastTwoDelta; - abstract_[id].totMidDelta += midDelta; - - // update totAbstract_ - totAbstract_.freq++; - if (delta < totAbstract_.minDelta) { - totAbstract_.minDelta = delta; - } - if (delta > totAbstract_.maxDelta) { - totAbstract_.maxDelta = delta; - } - totAbstract_.totDelta += delta; - totAbstract_.totSecondDelta += secondDelta; - totAbstract_.totLastTwoDelta += lastTwoDelta; - totAbstract_.totMidDelta += midDelta; - } - - totSamples_++; - timeVector_->reset(); - } -} - -void BarrierEndStat::reset(bool clearRawData) { - int32_t i = 0; - - totSamples_ = 0; - - std::lock_guard guard(abstractLock_); - - if (clearRawData) { - timeVector_->reset(); - } - - for (auto &abstract : abstract_) { - memset((void *)&abstract, 0, sizeof(abstract)); - abstract.minDelta = UINT64_MAX; - abstract.trainerId = i++; - } - memset((void *)&totAbstract_, 0, sizeof(Abstract)); - totAbstract_.minDelta = UINT64_MAX; -} - -void BarrierEndStat::showAbstract(std::ostream &output) const { - // do not support the case "<=2 pserver" - if (numConnThreads_ <= 2 || !totSamples_) { - return; - } - - // duplicate freq info - std::vector outputAbstract = abstract_; - std::sort(outputAbstract.begin(), - outputAbstract.end(), - [](const struct Abstract &a, const struct Abstract &b) { - return a.freq > b.freq; - }); - - auto rate = (float)outputAbstract[0].freq / (float)totSamples_; - if (rate < rateThreshold_) { - return; - } - - output << std::setw(20) << name_ << std::endl; - - /* - * Note: - * avgGap: the average delta between 1 -- n arriving trainers - * avgSecondGap: the average delta between 2 -- n arriving trainers - * avgLastTwoGap: the average delta between n-1 -- n arriving trainers - * avgMidGap: the average delta between n/2 -- n arriving trainers - * rato: samples / totSamples - * - * the stat is based on per trainer if trainer_id is set, totAbstract is - * stat based on all trainers scope. 
- */ - output << std::setw(42) << " " << std::setw(15) << "trainerId" - << std::setw(15) << "avgGap" << std::setw(15) << "avgSecondGap" - << std::setw(15) << "avgLastTwoGap" << std::setw(15) << "avgMidGap" - << std::setw(10) << "rate" << std::setw(10) << "samples" - << std::setw(10) << "totSamples" << std::endl; - // show totAbstract, it's valuable when lastTrainerId is even-distributed' - if (!totAbstract_.freq) return; - output << std::setw(42) << " " << std::setw(15) << "totAbstract" - << std::setw(15) << (totAbstract_.totDelta / totAbstract_.freq) * 0.001 - << std::setw(15) - << (totAbstract_.totSecondDelta / totAbstract_.freq) * 0.001 - << std::setw(15) - << (totAbstract_.totLastTwoDelta / totAbstract_.freq) * 0.001 - << std::setw(15) - << (totAbstract_.totMidDelta / totAbstract_.freq) * 0.001 - << std::setw(10) << (float)totAbstract_.freq / (float)totSamples_ - << std::setw(10) << (float)totAbstract_.freq << std::setw(10) - << (float)totSamples_ << std::endl; - - // show lastTrainerId abstract - int count = 0; - for (auto &abstract : outputAbstract) { - if (!abstract.freq || count++ >= FLAGS_log_barrier_lowest_nodes) { - break; - } - // output format control - output << std::setw(42) << " " << std::setw(15) << abstract.trainerId - << std::setw(15) << (abstract.totDelta / abstract.freq) * 0.001 - << std::setw(15) << (abstract.totSecondDelta / abstract.freq) * 0.001 - << std::setw(15) - << (abstract.totLastTwoDelta / abstract.freq) * 0.001 - << std::setw(15) << (abstract.totMidDelta / abstract.freq) * 0.001 - << std::setw(10) << (float)abstract.freq / (float)totSamples_ - << std::setw(10) << (float)abstract.freq << std::setw(10) - << (float)totSamples_ << std::endl; - } -} - -BarrierDeltaStat::BarrierDeltaStat(uint16_t numConnThreads, - const std::string &name) - : BarrierStatBase(numConnThreads, name) { - timeVector_.reset(new TimeVectorDelta(numConnThreads_)); - reset(true); - LOG(INFO) << " create barrierDeltaStat: " << name - << " barrierDelta warning rate: " << rateThreshold_; -} - -void BarrierDeltaStat::updateStat(uint64_t delta, int32_t trainerId) { - CHECK_LT(trainerId, numConnThreads_) << "trainerId is invalid in barrier"; - - std::lock_guard guard(lock_); - timeVector_->addTimeval(delta, trainerId); - - if (timeVector_->full()) { - std::lock_guard abstractGuard(abstractLock_); - auto id = timeVector_->getMaxTrainerId(); - auto delta = timeVector_->getDelta(); - // discard first sample, since first sample probably is abnormal. 
- if (totSamples_) { - abstract_[id].freq++; - - if (delta < abstract_[id].minDelta) { - abstract_[id].minDelta = delta; - } - if (delta > abstract_[id].maxDelta) { - abstract_[id].maxDelta = delta; - } - abstract_[id].totDelta += delta; - - // update totAbstract_ - totAbstract_.freq++; - if (delta < totAbstract_.minDelta) { - totAbstract_.minDelta = delta; - } - if (delta > totAbstract_.maxDelta) { - totAbstract_.maxDelta = delta; - } - totAbstract_.totDelta += delta; - } - - totSamples_++; - timeVector_->reset(); - } -} - -void BarrierDeltaStat::reset(bool clearRawData) { - int32_t i = 0; - - totSamples_ = 0; - - std::lock_guard guard(abstractLock_); - - if (clearRawData) { - timeVector_->reset(); - } - - for (auto &abstract : abstract_) { - memset((void *)&abstract, 0, sizeof(abstract)); - abstract.minDelta = UINT64_MAX; - abstract.trainerId = i++; - } - memset((void *)&totAbstract_, 0, sizeof(Abstract)); - totAbstract_.minDelta = UINT64_MAX; -} - -void BarrierDeltaStat::showAbstract(std::ostream &output) const { - // do not support the case "<=2 pserver" - if (numConnThreads_ <= 2 || !totSamples_) { - return; - } - - // duplicate freq info - std::vector outputAbstract = abstract_; - std::sort(outputAbstract.begin(), - outputAbstract.end(), - [](const struct Abstract &a, const struct Abstract &b) { - return a.freq > b.freq; - }); - - auto rate = (float)outputAbstract[0].freq / (float)totSamples_; - if (rate < rateThreshold_) { - return; - } - - output << std::setw(20) << name_ << std::endl; - - /* Note: - * Gap means the delta from all trainers' forwardbackward - * avgGap: average Gap in log_period batches - * minGap: min Gap in log_period batches - * maxGap: max Gap in log_period batches - * trainerId: the slowest trainer_id - * - * the stat is based on per trainer if trainer_id is set, totAbstract is - * stat based on all trainers scope. - */ - output << std::setw(42) << " " << std::setw(15) << "trainerId" - << std::setw(15) << "avgGap" << std::setw(10) << "minGap" - << std::setw(10) << "maxGap" << std::setw(10) << "rate" - << std::setw(10) << "samples" << std::setw(10) << "totSamples" - << std::endl; - // show totAbstract, it's valuable when lastTrainerId is even-distributed' - if (!totAbstract_.freq) return; - output << std::setw(42) << " " << std::setw(15) << "totAbstract" - << std::setw(15) << (totAbstract_.totDelta / totAbstract_.freq) * 0.001 - << std::setw(10) << totAbstract_.minDelta * 0.001 << std::setw(10) - << totAbstract_.maxDelta * 0.001 << std::setw(10) - << (float)totAbstract_.freq / (float)totSamples_ << std::setw(10) - << (float)totAbstract_.freq << std::setw(10) << (float)totSamples_ - << std::endl; - - // show lastTrainerId abstract - int count = 0; - for (auto &abstract : outputAbstract) { - if (!abstract.freq || count++ >= FLAGS_log_barrier_lowest_nodes) { - break; - } - // output format control - output << std::setw(42) << " " << std::setw(15) << abstract.trainerId - << std::setw(15) << (abstract.totDelta / abstract.freq) * 0.001 - << std::setw(10) << abstract.minDelta * 0.001 << std::setw(10) - << abstract.maxDelta * 0.001 << std::setw(10) - << (float)abstract.freq / (float)totSamples_ << std::setw(10) - << (float)abstract.freq << std::setw(10) << (float)totSamples_ - << std::endl; - } -} -} // namespace paddle diff --git a/paddle/utils/BarrierStat.h b/paddle/utils/BarrierStat.h deleted file mode 100644 index a9c925eff6..0000000000 --- a/paddle/utils/BarrierStat.h +++ /dev/null @@ -1,425 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "Locks.h" -#include "Logging.h" -#include "ThreadLocal.h" - -namespace paddle { - -inline uint64_t timeToMicroSecond(struct timeval time) { - return time.tv_sec * 1000000LU + time.tv_usec; -} - -class TimeVectorEnd { - /* - * help class for gathering all barrier performance data - * which shows time point property. - * freqently used in barrier performance tuning API, such - * as tuning which is slowest node in sync-sgd mode training. - */ -public: - explicit TimeVectorEnd(uint16_t size) : size_(size) { - index_ = 0; - timeArray_.resize(size); - trainerIds_.resize(size); - } - ~TimeVectorEnd() {} - - uint16_t size() { return size_; } - - bool full() { return index_ == size_; } - - bool empty() { return index_ == 0; } - - void reset() { index_ = 0; } - - void addTimeval(struct timeval time, int32_t trainerId) { - timeArray_[index_] = time; - trainerIds_[index_] = trainerId; - index_++; - } - - struct timeval getDelta() const { - struct timeval delta; - CHECK_GT(size_, 1) << "not support with 1 pserver"; - timersub(&timeArray_[size_ - 1], &timeArray_[0], &delta); - return delta; - } - - /* 2, n delta */ - struct timeval get1NDelta() const { - CHECK_GT(size_, 2) << "not support with less than 2 pservers"; - struct timeval delta; - timersub(&timeArray_[size_ - 1], &timeArray_[1], &delta); - return delta; - } - - /* n-1, n delta */ - struct timeval getMinus1NDelta() const { - CHECK_GT(size_, 2) << "not support with less than 2 pservers"; - struct timeval delta; - timersub(&timeArray_[size_ - 1], &timeArray_[size_ - 2], &delta); - return delta; - } - - /* n/2, n delta */ - struct timeval getMidNDelta() const { - CHECK_GT(size_, 2) << "not support with less than 2 pservers"; - struct timeval delta; - timersub(&timeArray_[size_ - 1], &timeArray_[size_ / 2], &delta); - return delta; - } - - int32_t getLastTrainerId() const { return trainerIds_[index_ - 1]; } - -private: - uint16_t size_; - uint16_t index_; - std::vector timeArray_; - std::vector trainerIds_; -}; - -class TimeVectorDelta { - /* - * help class for gathering performance data which shows time - * delta property, such as tuning the time distribution of - * forwardBackward time from all cluster nodes. 
- */ -public: - explicit TimeVectorDelta(uint16_t size) - : size_(size), min_(UINT64_MAX), max_(0) { - index_ = 0; - timeArray_.resize(size); - } - ~TimeVectorDelta() {} - - uint16_t size() { return size_; } - - bool full() { return index_ == size_; } - - bool empty() { return index_ == 0; } - - void reset() { - index_ = 0; - min_ = UINT64_MAX; - max_ = 0; - } - - void addTimeval(uint64_t delta, int32_t trainerId) { - timeArray_[index_] = delta; - index_++; - if (delta < min_) { - min_ = delta; - } - if (delta > max_) { - max_ = delta; - maxTrainerId_ = trainerId; - } - } - - uint64_t getDelta() const { - CHECK_GT(size_, 1) << "not support with 1 pserver"; - return max_ - min_; - } - - /* 2, n delta */ - uint64_t get1NDelta() const { - CHECK_GT(size_, 2) << "not support with less than 2 pservers"; - LOG(FATAL) << "Not implemented"; - } - - /* n-1, n delta */ - uint64_t getMinus1NDelta() const { - CHECK_GT(size_, 2) << "not support with less than 2 pservers"; - LOG(FATAL) << "Not implemented"; - } - - /* n/2, n delta */ - uint64_t getMidNDelta() const { - CHECK_GT(size_, 2) << "not support with less than 2 pservers"; - LOG(FATAL) << "Not implemented"; - } - - int32_t getMaxTrainerId() const { return maxTrainerId_; } - -private: - uint16_t size_; - uint16_t index_; - std::vector timeArray_; - -private: - uint64_t min_; - uint64_t max_; - int32_t maxTrainerId_; -}; - -// total samples stats, us -struct Abstract { - // last trainerId for barrier end, maxDelta trainerId for barrier delta - int32_t trainerId; - uint64_t minDelta; - uint64_t maxDelta; - uint64_t totDelta; - // first one is probably itself, so discard it. - uint64_t totSecondDelta; - // to confirm if last node destroy barrier performance. - uint64_t totLastTwoDelta; - // n/2-n delta - uint64_t totMidDelta; - uint64_t freq; -}; - -// barrier performance tunning stats -class BarrierStatBase { -public: - BarrierStatBase(uint16_t numConnThreads, const std::string &name); - - virtual ~BarrierStatBase() {} - - // if called at pserver end, then trainId means trainer's id. - // by default trainer does not use trainerId, so set it to -1 - virtual void updateStat(struct timeval &cur, int32_t trainerId = -1) = 0; - virtual void updateStat(uint64_t delta, int32_t trainerId = -1) = 0; - - const std::string &getName() { return name_; } - - virtual void reset(bool clearRawData = true) {} - // since the timeVector_ is not stateful, so it's not clear whether the - // the barrier delta is correct. if one timestamp was lost, the all data - // from barrier stat becomes rubbish. 
-_- - virtual bool checkPassBarrier() { - LOG(INFO) << "bug implementation found"; - return false; - } - -protected: - virtual void showAbstract(std::ostream &output) const {} - friend std::ostream &operator<<(std::ostream &output, - const BarrierStatBase &stat); - -protected: - mutable std::mutex lock_; - std::mutex abstractLock_; // see note on updaterStat - // each freqency for each barrier trainer - std::vector abstract_; - // it is valuable when do perf-tuining, if lastTrainerId acts uniform - // distribution - struct Abstract totAbstract_; - uint64_t totSamples_; - -protected: - uint16_t numConnThreads_; // total updates needed - float rateThreshold_; - std::string name_; -}; - -// the end-time of arriving real/forged barrier position -class BarrierEndStat : public BarrierStatBase { -public: - BarrierEndStat(uint16_t numConnThreads, const std::string &name); - ~BarrierEndStat() {} - - virtual void updateStat(struct timeval &cur, int32_t trainerId = -1); - virtual void updateStat(uint64_t delta, int32_t trainerId = -1) { - LOG(INFO) << "have no delta updateStat in BarrierEndStat"; - } - virtual void reset(bool clearRawData = true); - virtual bool checkPassBarrier() { return timeVector_->empty(); } - -protected: - /* - * LOG: - * readAllBlocks_denseUpdater - * trainerId avgGap avgSecondGap avgLastTwoGap avgMidGap rate - * 44 86.702 81.022 9.984 50.472 0.144737 - * 46 87.723 82.939 8.737 50.019 0.118421 - * 35 100.923 96.752 14.305 61.979 - * 0.0657895 - * log_barrier_abstract, log_barrier_lowest_nodes, log_barrier_threshold - * control details. - */ - virtual void showAbstract(std::ostream &output) const; - -private: - std::unique_ptr timeVector_; -}; - -// the delta-time from different trainers, -// eg, find the degree of imbalance of BP time at pserver end -// the entry value in timerVector_ is BP delta, do evaluation to BP delta. -class BarrierDeltaStat : public BarrierStatBase { -public: - BarrierDeltaStat(uint16_t numConnThreads, const std::string &name); - ~BarrierDeltaStat() {} - - virtual void updateStat(uint64_t delta, int32_t trainerId = -1); - virtual void updateStat(struct timeval &cur, int32_t trainerId = -1) { - LOG(INFO) << "have no timeval updateStat in BarrierDeltaStat"; - } - - virtual void reset(bool clearRawData = true); - - virtual bool checkPassBarrier() { return timeVector_->empty(); } - -protected: - virtual void showAbstract(std::ostream &outPut) const; - -private: - // store delta time in uint64_t, eg BP time of all trainers - std::unique_ptr timeVector_; -}; - -// to distinguish different contexts for same parallel threads, and different -// threads with same code-sgement, just use tagName to tag the run-time -// position. -// in Sparse, sendParallel threads can not only run in the stage of push&pull -// with same thread group, but also run in the stage of pull&push with different -// thread group, tag will be used to distinguish different run-time barrier -// position. -// trainerId in REGISTER_BARRIER_TIMER_SERVER is used to retreive lowest trainer -// nodes. - -// end barrier -#define __REGISTER_BARRIER_TIMER_SERVER( \ - set, statName, numConnThreads, trainerId, ...) 
\ - do { \ - if (numConnThreads > 2) { \ - std::string internalName = \ - std::string(statName) + std::string(__VA_ARGS__); \ - BarrierStatPtr __stat = \ - (set).getStat(numConnThreads, internalName, BARRIER_END); \ - struct timeval cur; \ - gettimeofday(&cur, nullptr); \ - __stat->updateStat(cur, trainerId); \ - } \ - } while (0); - -// end barrier with user-defined timer -#define __REGISTER_BARRIER_TIMER_SERVER_SET( \ - set, statName, numConnThreads, trainerId, cur, ...) \ - do { \ - if (numConnThreads > 2) { \ - std::string internalName = \ - std::string(statName) + std::string(__VA_ARGS__); \ - BarrierStatPtr __stat = \ - (set).getStat(numConnThreads, internalName, BARRIER_END); \ - __stat->updateStat(cur, trainerId); \ - } \ - } while (0); - -// delta barrier -#define __REGISTER_BARRIER_DELTA_SERVER_SET( \ - set, statName, numConnThreads, trainerId, delta, ...) \ - do { \ - if (numConnThreads > 2) { \ - std::string internalName = \ - std::string(statName) + std::string(__VA_ARGS__); \ - BarrierStatPtr __stat = \ - (set).getStat(numConnThreads, internalName, BARRIER_DELTA); \ - __stat->updateStat(delta, trainerId); \ - } \ - } while (0); - -// check end barrier -#define __CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...) \ - do { \ - std::string internalName = \ - std::string(statName) + std::string(__VA_ARGS__); \ - BarrierStatPtr __stat = \ - (set).getStat(numConnThreads, internalName, BARRIER_END); \ - PCHECK(__stat->checkPassBarrier()) << internalName \ - << ": invalid barrier data"; \ - } while (0); - -/* - * Note: - * with sync-sgd algriothm in cluster mode, lots of synchronize action exsit at - * pserve end. these synchronizaton actions have impact on the efficiency of - * parameter exchange. the synchronizaton(barrier) GAP is composed of lots of - * factors, such as the forwardBackward variance, network fluncation. we try - * to have a quantitative analysis on these factor, so we design lots of barrier - * time to capture these performance. these barrier also can be placed at - * implict barrier position. - * - * example: - * in sync-sgd algorithm, each parameter server waits for all gradients from - * all trainers, thus, an explict barrier point exsit before doing optimization. - * the barrier timer located before the point can sense the barrier condition. - * - */ - -// try to capture which trainer is slowest node in sync-sgd at pserver. -#define REGISTER_SLOW_NODES_PROBE( \ - set, statName, numConnThreads, trainerId, ...) \ - __REGISTER_BARRIER_TIMER_SERVER( \ - (set), statName, numConnThreads, trainerId, __VA_ARGS__) -// try to check if all threads or trainers have passed barriers for data -// accuracy. -#define CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...) \ - __CHECK_BARRIER_TIMER((set), statName, numConnThreads, __VA_ARGS__) - -#ifdef PADDLE_DISABLE_TIMER - -#define REGISTER_BARRIER_TIMER_SERVER( \ - set, statName, numConnThreads, trainerId, ...) -#define REGISTER_BARRIER_TIMER_SERVER_SET( \ - set, statName, numConnThreads, trainerId, cur, ...) -#define REGISTER_BARRIER_DELTA_SERVER_SET( \ - set, statName, numConnThreads, trainerId, cur, ...) - -#else - -/* - * sensing barrier time distribution for all parallelization threads. - * it provides low API for slow node check(REGISTER_SLOW_NODES_PROBE) - */ -#define REGISTER_BARRIER_TIMER_SERVER( \ - set, statName, numConnThreads, trainerId, ...) 
\ - __REGISTER_BARRIER_TIMER_SERVER( \ - (set), statName, numConnThreads, trainerId, __VA_ARGS__) - -/* - * sensing barrier time distribution for all parallelization threads. - * but time point for barrier performance is set by user. - * eg, with this api, you can get implict barrier point such as the beginning - * time distribution - * for receiving data. - */ -#define REGISTER_BARRIER_TIMER_SERVER_SET( \ - set, statName, numConnThreads, trainerId, cur, ...) \ - __REGISTER_BARRIER_TIMER_SERVER_SET( \ - (set), statName, numConnThreads, trainerId, cur, __VA_ARGS__) - -// try to capture time delta from all trainers, such as forwardBackward time -// which implies -// computation fluctuation -#define REGISTER_BARRIER_DELTA_SERVER_SET( \ - set, statName, numConnThreads, trainerId, delta, ...) \ - __REGISTER_BARRIER_DELTA_SERVER_SET( \ - (set), statName, numConnThreads, trainerId, delta, __VA_ARGS__) - -#endif // DISABLE_TIMER -} // namespace paddle diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp index c7194d3bf1..ff1b1bf888 100644 --- a/paddle/utils/Stat.cpp +++ b/paddle/utils/Stat.cpp @@ -97,34 +97,6 @@ std::ostream& operator<<(std::ostream& outPut, const Stat& stat) { return outPut; } -BarrierStatPtr StatSet::getStat(uint16_t numConnThreads, - const std::string& name, - BarrierStatType bType) { - { - ReadLockGuard guard(lock_); - auto it = barrierStatSet_.find(name); - if (it != barrierStatSet_.end()) { - return it->second; - } - } - - std::lock_guard guard(lock_); - // test again with lock_guard - auto it = barrierStatSet_.find(name); - if (it != barrierStatSet_.end()) { - return it->second; - } - - BarrierStatPtr stat; - if (bType == BARRIER_END) { - stat = std::make_shared(numConnThreads, name); - } else if (bType == BARRIER_DELTA) { - stat = std::make_shared(numConnThreads, name); - } - auto ret = barrierStatSet_.insert(std::make_pair(name, stat)); - return ret.first->second; -} - void StatSet::printSegTimerStatus() { ReadLockGuard guard(lock_); LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') @@ -135,46 +107,20 @@ void StatSet::printSegTimerStatus() { } } -void StatSet::printBarrierTimerStatus() { - ReadLockGuard guard(lock_); - if (barrierStatSet_.empty()) { - return; - } - // control barrierAbstact in runtime, so enable compliation - LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') - << "======= BarrierStatSet status ======" << std::endl; - for (auto& stat : barrierStatSet_) { - LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') - << *(stat.second); - } -} - void StatSet::printAllStatus() { #ifndef PADDLE_DISABLE_TIMER printSegTimerStatus(); #endif - printBarrierTimerStatus(); LOG(INFO) << std::setiosflags(std::ios::left) << "--------------------------------------------------" << std::endl; } -void StatSet::printStatus(const std::string& name) { - ReadLockGuard guard(lock_); - auto iter = statSet_.find(name); - CHECK(iter != statSet_.end()) << name << " is not registed in " << name_; - LOG(INFO) << *(iter->second); -} - void StatSet::reset(bool clearRawData) { ReadLockGuard guard(lock_); for (auto& stat : statSet_) { stat.second->reset(); } - // reset barrierStat - for (auto& stat : barrierStatSet_) { - stat.second->reset(clearRawData); - } } void StatSet::setThreadInfo(const std::string& name, bool flag) { @@ -184,13 +130,6 @@ void StatSet::setThreadInfo(const std::string& name, bool flag) { iter->second->setThreadInfo(flag); } -void StatSet::deleteStat(const std::string& name) { - std::lock_guard guard(lock_); - auto iter 
= statSet_.find(name); - CHECK(iter != statSet_.end()) << name << " is not registed in " << name_; - statSet_.erase(iter); -} - StatInfo::~StatInfo() { if (stat_) { std::lock_guard guard(stat_->lock_); diff --git a/paddle/utils/Stat.h b/paddle/utils/Stat.h index d9cc6e413a..79fd3b8cf0 100644 --- a/paddle/utils/Stat.h +++ b/paddle/utils/Stat.h @@ -23,7 +23,6 @@ limitations under the License. */ #include #include -#include "BarrierStat.h" #include "Locks.h" #include "Logging.h" #include "ThreadLocal.h" @@ -60,12 +59,6 @@ public: class Stat; typedef std::shared_ptr StatPtr; -typedef std::shared_ptr BarrierStatPtr; - -enum BarrierStatType { - BARRIER_END = 0, - BARRIER_DELTA = 1, -}; class StatSet { public: @@ -74,11 +67,8 @@ public: // print to LOG(INFO) void printSegTimerStatus(); - void printBarrierTimerStatus(); void printAllStatus(); - void printStatus(const std::string& name); - StatPtr getStat(const std::string& name) { { ReadLockGuard guard(lock_); @@ -93,12 +83,6 @@ public: return ret.first->second; } - BarrierStatPtr getStat(uint16_t numConnThreads, - const std::string& name, - BarrierStatType bType); - - void deleteStat(const std::string& name); - // true for showing stats for each thread // false for showing stats aggragated over threads void setThreadInfo(const std::string& name, bool flag); @@ -120,7 +104,6 @@ public: private: std::unordered_map statSet_; - std::unordered_map barrierStatSet_; const std::string name_; RWLock lock_; }; From 166dfbb085ef4ebbccea190abc436524fb80ed57 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 5 Jul 2017 02:36:10 +0000 Subject: [PATCH 49/79] fix cmake errors --- cmake/generic.cmake | 7 ++++--- go/cmd/master/CMakeLists.txt | 2 +- go/cmd/pserver/CMakeLists.txt | 2 +- go/pserver/optimizer.go | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index cae9524b2f..97196114ff 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -284,7 +284,7 @@ function(go_library TARGET_NAME) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${${TARGET_NAME}_LIB_PATH}" # Golang build source code - COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} + COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH} GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} -o "${${TARGET_NAME}_LIB_PATH}" "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}" # must run under GOPATH @@ -300,10 +300,11 @@ function(go_binary TARGET_NAME) string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) add_custom_command(OUTPUT ${TARGET_NAME}_timestamp - COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build + COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH} + GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}" - WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") # TODO: don't know what ${TARGET_NAME}_link does add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS}) install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin) diff --git a/go/cmd/master/CMakeLists.txt b/go/cmd/master/CMakeLists.txt index 9e149967e7..1058ffa86b 100644 --- a/go/cmd/master/CMakeLists.txt +++ b/go/cmd/master/CMakeLists.txt @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations 
under the License. -go_binary(master SRC master.go) +go_binary(master SRC master.go DEPS paddle_go_optimizer) diff --git a/go/cmd/pserver/CMakeLists.txt b/go/cmd/pserver/CMakeLists.txt index bc1da3348c..51db6dff04 100644 --- a/go/cmd/pserver/CMakeLists.txt +++ b/go/cmd/pserver/CMakeLists.txt @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -go_binary(pserver SRCS pserver.go) +go_binary(pserver SRCS pserver.go DEPS paddle_go_optimizer) diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index d84f55b987..2c9b0d5652 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -2,7 +2,7 @@ package pserver // #cgo CFLAGS: -I ../../ // //FIXME: ldflags contain "build" path -// #cgo LDFLAGS: ../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm +// #cgo LDFLAGS: -lpaddle_go_optimizer -lstdc++ -lm // #include "paddle/optimizer/optimizer.h" // #include // #include From cd437f5072b0482685d107c386e587bc1fe59044 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 5 Jul 2017 05:16:41 +0000 Subject: [PATCH 50/79] fix bugs --- go/pserver/client/c/test/CMakeLists.txt | 4 +++- go/pserver/optimizer.go | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/go/pserver/client/c/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt index f287f85071..44bc183738 100644 --- a/go/pserver/client/c/test/CMakeLists.txt +++ b/go/pserver/client/c/test/CMakeLists.txt @@ -1,2 +1,4 @@ -cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient) +# FIXME:It's ugly +#cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer) +cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_optimizer paddle_proto glog gflags protobuf) add_style_check_target(test_cclient test_cclient.c) diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index 2c9b0d5652..93389b93a7 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -1,7 +1,6 @@ package pserver // #cgo CFLAGS: -I ../../ -// //FIXME: ldflags contain "build" path // #cgo LDFLAGS: -lpaddle_go_optimizer -lstdc++ -lm // #include "paddle/optimizer/optimizer.h" // #include From 1409b17e4f20afdd922b8566be324581ed3f0e54 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 5 Jul 2017 06:06:13 +0000 Subject: [PATCH 51/79] add fixme --- cmake/generic.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 97196114ff..74396abdbb 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -281,6 +281,7 @@ function(go_library TARGET_NAME) file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go") string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + # FIXME: link path add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${${TARGET_NAME}_LIB_PATH}" # Golang build source code @@ -299,6 +300,7 @@ function(go_binary TARGET_NAME) cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + # FIXME: link path add_custom_command(OUTPUT ${TARGET_NAME}_timestamp COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH} GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build From 7364fcd4c3c6b08b569ed2bb809bed9904b55030 Mon Sep 17 00:00:00 2001 From: wuyi05 Date: Wed, 5 Jul 2017 15:42:17 +0800 Subject: [PATCH 52/79] add golang precommit 
--- .pre-commit-config.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4cd8eb12f6..a7c450176d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,3 +21,10 @@ sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 hooks: - id: clang-formater +- repo: https://github.com/dnephin/pre-commit-golang + sha: e4693a4c282b4fc878eda172a929f7a6508e7d16 + hooks: + - id: go-fmt + - id: go-vet + - id: go-lint + - id: gometalinter From e7b071f33a2af3168586ef2710835b694f61e958 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Wed, 5 Jul 2017 15:55:26 +0800 Subject: [PATCH 53/79] update annotation with comments --- paddle/gserver/layers/AverageLayer.h | 4 ++++ paddle/gserver/layers/MaxLayer.h | 7 +++---- paddle/gserver/layers/SequenceLastInstanceLayer.cpp | 7 +++---- paddle/gserver/layers/SequencePoolLayer.h | 5 +++-- python/paddle/trainer_config_helpers/layers.py | 11 +++++++---- 5 files changed, 20 insertions(+), 14 deletions(-) diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h index 332552a304..db4a17bfb0 100644 --- a/paddle/gserver/layers/AverageLayer.h +++ b/paddle/gserver/layers/AverageLayer.h @@ -25,6 +25,10 @@ namespace paddle { * If SequenceLevel = kNonSeq: * Output: output size is the number of input sequences (NOT input instances) * output[i] = average_{for each instance in this sequence}{input[i]} + * If stride_ > 0: + * Output: a shorten sequence. Stride is the step size by which we slide a + * window upon the input sequence, and the average pooling + * operation is then applied to each interval independently. * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: output size is the number of input sub-sequences diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h index adf7ab4ae4..fa536fce2b 100644 --- a/paddle/gserver/layers/MaxLayer.h +++ b/paddle/gserver/layers/MaxLayer.h @@ -27,10 +27,9 @@ namespace paddle { * Output: output size is the number of input sequences (NOT input instances) * output[i] = max_{for each instance in this sequence}{input[i]} * If stride_ > 0: - * Output: a shorten sequence. The operation of getting max instance of a - * sequence is independently performed on every slice of the input - * sequence, which is obtained by sliding a window with the window - * size set to stride_. + * Output: a shorten sequence. Stride is the step size by which we slide a + * window upon the input sequence, and the max pooling operation is + * then applied to each interval independently. * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: output size is the number of input sub-sequences diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp index 8127cbf09c..323cc47df1 100644 --- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp +++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp @@ -26,10 +26,9 @@ namespace paddle { * If SequenceLevel = kNonseq: * Output: a sequence containing only the last instance of the input sequence * If stride_ > 0: - * Output: a shorten sequence. The operation of getting last instance of a - * sequence is independently performed on every slice of the input - * sequence, which is obtained by sliding a window with the window - * size set to stride_. + * Output: a shorten sequence. 
Stride is the step size by which we slide a + * window upon the input sequence, and getting last instance + * operation is then applied to each interval independently. * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: a sequence containing only the last instance of each sub-sequence diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h index 058627def8..e207afd1dc 100644 --- a/paddle/gserver/layers/SequencePoolLayer.h +++ b/paddle/gserver/layers/SequencePoolLayer.h @@ -28,8 +28,9 @@ namespace paddle { * sequence}{input[i]} * If stride_ > 0: * Check input sequence must not have sub-sequence - * Output: a shorten sequence, pooling is performed upon a small local - * area + * Output: a shorten sequence. Stride is the step size by which we slide + * a window upon the input sequence, and the pooling operation + * is then applied to each interval independently. * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: output size is the number of input sub-sequences diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 5e8bf4b203..2f52a27e60 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -1097,7 +1097,10 @@ def pooling_layer(input, If stride > 0, this layer slides a window whose size is determined by stride, and return the pooling value of the window as the output. Thus, a long sequence - will be shorten. Note that for sequence with sub-sequence, the default value + will be shorten. + + The parameter stride specifies the intervals at which to apply the pooling + operation. Note that for sequence with sub-sequence, the default value of stride is -1. The example usage is: @@ -1118,7 +1121,7 @@ def pooling_layer(input, :param pooling_type: Type of pooling, MaxPooling(default), AvgPooling, SumPooling, SquareRootNPooling. :type pooling_type: BasePoolingType|None - :param stride: window size. + :param stride: The step size between successive pooling regions. :type stride: Int :param bias_attr: Bias parameter attribute. False if no bias. :type bias_attr: ParameterAttribute|None|False @@ -1408,7 +1411,7 @@ def last_seq(input, :type name: basestring :param input: Input layer name. :type input: LayerOutput - :param stride: window size. + :param stride: The step size between successive pooling regions. :type stride: Int :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. @@ -1464,7 +1467,7 @@ def first_seq(input, :type name: basestring :param input: Input layer name. :type input: LayerOutput - :param stride: window size. + :param stride: The step size between successive pooling regions. :type stride: Int :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. 
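The stride semantics documented in the patch above are identical for max,
average, last- and first-instance pooling: the input sequence is cut into
consecutive intervals of length stride (the tail interval may be shorter),
the pooling operation is applied to each interval independently, and the
results form the shortened output sequence. A minimal sketch of that window
arithmetic, in Go for illustration only (PaddlePaddle implements it in the
C++ layers patched above):

    package main

    import "fmt"

    // seqPoolMax slides a non-overlapping window of size stride over the
    // sequence and takes the max of each interval, so a sequence of length
    // n shrinks to ceil(n/stride) elements.
    func seqPoolMax(seq []float64, stride int) []float64 {
        var out []float64
        for start := 0; start < len(seq); start += stride {
            end := start + stride
            if end > len(seq) {
                end = len(seq) // the tail interval may be shorter
            }
            m := seq[start]
            for _, v := range seq[start+1 : end] {
                if v > m {
                    m = v
                }
            }
            out = append(out, m)
        }
        return out
    }

    func main() {
        fmt.Println(seqPoolMax([]float64{1, 5, 2, 4, 3}, 2)) // [5 4 3]
    }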
From 7ed6463ee91e0b71e7beca313554eae36da1c4e4 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 24 May 2017 13:55:58 +0800 Subject: [PATCH 54/79] fix bugs for CrossChannelNormLayer --- .../gserver/layers/CrossChannelNormLayer.cpp | 32 ++++++++++++++----- paddle/gserver/layers/NormLayer.cpp | 10 ------ paddle/gserver/tests/LayerGradUtil.cpp | 7 +++- paddle/gserver/tests/LayerGradUtil.h | 6 ++++ paddle/gserver/tests/test_LayerGrad.cpp | 5 ++- 5 files changed, 40 insertions(+), 20 deletions(-) diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp index 3fbccc1103..4dfe460561 100644 --- a/paddle/gserver/layers/CrossChannelNormLayer.cpp +++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp @@ -36,6 +36,16 @@ MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data, data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_); } +bool CrossChannelNormLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + CHECK(parameters_[0]); + const NormConfig& conf = config_.inputs(0).norm_conf(); + channels_ = conf.channels(); + scale_.reset(new Weight(channels_, 1, parameters_[0])); + return true; +} + void CrossChannelNormLayer::forward(PassType passType) { Layer::forward(passType); MatrixPtr inV = getInputValue(0); @@ -63,6 +73,7 @@ void CrossChannelNormLayer::forward(PassType passType) { // compute norm. spatialBuffer_->sumCols(*dataTmp, 1, 0); + spatialBuffer_->add(*normTmp); spatialBuffer_->sqrt2(*spatialBuffer_); normTmp->copyFrom(*spatialBuffer_); outVTmp->copyFrom(*inVTmp); @@ -82,6 +93,9 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) { size_t dataDim = inG->getWidth(); size_t spatialDim = dataDim / channels_; + MatrixPtr inGBuffer; + Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_); + dataBuffer_->dotMul(*outG, *outV); Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_); Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_); @@ -100,22 +114,24 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) { scaleDiff_->add(*channelBuffer_, 1.); sampleBuffer_->dotMul(*inVTmp, *outGTmp); - spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.); + spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.); // scale the grad - inGTmp->copyFrom(*inVTmp); - inGTmp->mulRowVector(*spatialBuffer_); + inGBuffer->copyFrom(*inVTmp); + inGBuffer->mulRowVector(*spatialBuffer_); // divide by square of norm spatialBuffer_->dotMul(*normTmp, *normTmp); - inGTmp->divRowVector(*spatialBuffer_); + inGBuffer->divRowVector(*spatialBuffer_); // subtract - inGTmp->add(*outGTmp, -1, 1); + inGBuffer->add(*outGTmp, -1, 1); // divide by norm - inGTmp->divRowVector(*normTmp); + inGBuffer->divRowVector(*normTmp); // scale the diff - inGTmp->mulColVector(*scale_->getW()); + inGBuffer->mulColVector(*scale_->getW()); + + inGTmp->add(*inGBuffer); } // updata scale - if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_); + if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_); scale_->getParameterPtr()->incUpdate(callback); } diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp index e094078bfe..caef710092 100644 --- a/paddle/gserver/layers/NormLayer.cpp +++ b/paddle/gserver/layers/NormLayer.cpp @@ -56,14 +56,4 @@ bool ResponseNormLayer::init(const LayerMap& layerMap, return true; } -bool CrossChannelNormLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) 
{ - Layer::init(layerMap, parameterMap); - CHECK(parameters_[0]); - const NormConfig& conf = config_.inputs(0).norm_conf(); - channels_ = conf.channels(); - scale_.reset(new Weight(channels_, 1, parameters_[0])); - return true; -} - } // namespace paddle diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index e3591ba4df..66aafba844 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -465,7 +465,6 @@ void initTestLayer(TestConfig testConf, ParameterConfig paraConfig) { paraConfig.set_name(paraName); paraConfig.set_size(paraSize); - paraConfig.set_initial_std(1); paraConfig.set_is_static(isStatic); auto para = std::make_shared(paraConfig, FLAGS_use_gpu, initialize); @@ -499,6 +498,12 @@ void initTestLayer(TestConfig testConf, paraConfig.add_dims((*layerMap)[input.input_layer_name()]->getSize()); paraConfig.add_dims(testConf.layerConfig.size()); } + if (testConf.hasParamInitialValue) { + paraConfig.set_initial_mean(testConf.paramInitialMean); + paraConfig.set_initial_std(testConf.paramInitialStd); + } else { + paraConfig.set_initial_std(1); + } initParameter(paraName, paraSize, inputDef.isStatic, false, paraConfig); } } diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index 18a6525a14..5ea7ca0f24 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -125,12 +125,18 @@ struct TestConfig { LayerConfig layerConfig; std::vector inputDefs; size_t biasSize; + real paramInitialMean; + real paramInitialStd; + bool hasParamInitialValue; bool testAccumulate; bool testState; bool staticBias; bool testBatchState; TestConfig() : biasSize(0), + paramInitialMean(0), + paramInitialStd(1), + hasParamInitialValue(false), testAccumulate(true), testState(false), staticBias(false), diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 59d1e9273d..6441e08b48 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1661,6 +1661,9 @@ TEST(Layer, PadLayer) { TEST(Layer, CrossChannelNormLayer) { TestConfig config; + config.hasParamInitialValue = true; + config.paramInitialMean = 1.; + config.paramInitialStd = 0.; config.layerConfig.set_type("norm"); config.layerConfig.set_size(100); LayerInputConfig* input = config.layerConfig.add_inputs(); @@ -1674,7 +1677,7 @@ TEST(Layer, CrossChannelNormLayer) { config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10}); for (auto useGpu : {false, true}) { - testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5); + testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false); } } From 2bf4f1bbc1e4abc9c173b89aeb96c40b404e94f4 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 24 May 2017 14:22:41 +0800 Subject: [PATCH 55/79] make adding eps more clear --- paddle/gserver/layers/CrossChannelNormLayer.cpp | 7 +++---- paddle/gserver/tests/LayerGradUtil.h | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp index 4dfe460561..d72503217f 100644 --- a/paddle/gserver/layers/CrossChannelNormLayer.cpp +++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp @@ -61,9 +61,7 @@ void CrossChannelNormLayer::forward(PassType passType) { Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_); Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_); 
Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_); - normBuffer_->zeroMem(); - // add eps to avoid overflow - normBuffer_->addScalar(*normBuffer_, 1e-6); + inV->square2(*dataBuffer_); for (size_t i = 0; i < batchSize; i++) { const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim); @@ -73,7 +71,8 @@ void CrossChannelNormLayer::forward(PassType passType) { // compute norm. spatialBuffer_->sumCols(*dataTmp, 1, 0); - spatialBuffer_->add(*normTmp); + // add eps to avoid overflow + spatialBuffer_->add(1e-6); spatialBuffer_->sqrt2(*spatialBuffer_); normTmp->copyFrom(*spatialBuffer_); outVTmp->copyFrom(*inVTmp); diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index 5ea7ca0f24..9dbd202757 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -134,8 +134,8 @@ struct TestConfig { bool testBatchState; TestConfig() : biasSize(0), - paramInitialMean(0), - paramInitialStd(1), + paramInitialMean(0.0), + paramInitialStd(1.0), hasParamInitialValue(false), testAccumulate(true), testState(false), From 7c6aa04f6185e92082b9a742d5c746b335406711 Mon Sep 17 00:00:00 2001 From: wuyi05 Date: Wed, 5 Jul 2017 16:24:53 +0800 Subject: [PATCH 56/79] add go pre-commit and travis build --- .pre-commit-config.yaml | 4 ++-- .travis.yml | 4 ++-- go/pserver/service.go | 6 ++++-- paddle/scripts/travis/build_doc.sh | 11 ++++++----- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a7c450176d..61b989dc69 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,6 +25,6 @@ sha: e4693a4c282b4fc878eda172a929f7a6508e7d16 hooks: - id: go-fmt - - id: go-vet + files: (.*\.go) - id: go-lint - - id: gometalinter + files: (.*\.go) diff --git a/.travis.yml b/.travis.yml index 16432dac0c..aafeeba027 100644 --- a/.travis.yml +++ b/.travis.yml @@ -33,7 +33,7 @@ addons: - ccache before_install: - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi - # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python + # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. 
- pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker - pip install rarfile @@ -42,7 +42,7 @@ before_install: function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: - | - export WITH_GOLANG=ON && timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout + timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; notifications: email: diff --git a/go/pserver/service.go b/go/pserver/service.go index 7711dc027e..ad16a5708d 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -10,8 +10,10 @@ import ( type ElementType int const ( + // AlreadyInitialized is true if pserver is initialized AlreadyInitialized = "pserver already initialized" - Uninitialized = "pserver not fully initialized" + // Uninitialized is true if pserver not fully initialized + Uninitialized = "pserver not fully initialized" ) // Supported element types @@ -55,7 +57,7 @@ func NewService(idx int) (*Service, error) { s := &Service{ idx: idx, } - s.optMap = make(map[string]*optimizer) + s.optMap = make(map[string]*optimizer) s.initialized = make(chan struct{}) return s, nil } diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index a44bd35357..a443851580 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -5,13 +5,14 @@ set -e mkdir -p $TRAVIS_BUILD_DIR/build cd $TRAVIS_BUILD_DIR/build -# Compile Documentation only. -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF +# Compile paddle binaries first +cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_GOLANG=ON -DWITH_STYLE_CHECK=OFF mkdir output make -j `nproc` find .. -name '*whl' | xargs pip install # install all wheels. rm -rf * +# Compile Documentation only. cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON make -j `nproc` paddle_docs paddle_docs_cn @@ -25,7 +26,7 @@ SSH_REPO=${REPO/https:\/\/github.com\//git@github.com:} SHA=`git rev-parse --verify HEAD` # Documentation branch name -# gh-pages branch is used for PaddlePaddle.org. The English version of +# gh-pages branch is used for PaddlePaddle.org. The English version of # documentation in `doc` directory, and the chinese version in `doc_cn` # directory. TARGET_BRANCH="gh-pages" @@ -51,7 +52,7 @@ function deploy_docs() { # checkout github page branch git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH - + mkdir -p ${DIR} # remove old docs. mv new docs. set +e @@ -62,7 +63,7 @@ function deploy_docs() { git add . } -deploy_docs "master" "." +deploy_docs "master" "." deploy_docs "develop" "./develop/" # Check is there anything changed. From 81bfd47eb3fdbf7a0c398f6ad7e62f1d6e7350c1 Mon Sep 17 00:00:00 2001 From: wuyi05 Date: Wed, 5 Jul 2017 16:32:14 +0800 Subject: [PATCH 57/79] add glide in travis --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index aafeeba027..498674469b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -37,6 +37,7 @@ before_install: # protobuf version. 
- pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker - pip install rarfile + - curl https://glide.sh/get | bash - eval "$(GIMME_GO_VERSION=1.8.3 gimme)" - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } From 2f085a7bcf11f5501bded27862988022e32299a0 Mon Sep 17 00:00:00 2001 From: wuyi05 Date: Wed, 5 Jul 2017 17:08:19 +0800 Subject: [PATCH 58/79] add go pserver deps --- go/cmd/pserver/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/cmd/pserver/CMakeLists.txt b/go/cmd/pserver/CMakeLists.txt index bc1da3348c..51db6dff04 100644 --- a/go/cmd/pserver/CMakeLists.txt +++ b/go/cmd/pserver/CMakeLists.txt @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -go_binary(pserver SRCS pserver.go) +go_binary(pserver SRCS pserver.go DEPS paddle_go_optimizer) From 5eb8bf0324ba7de923760dc05aa7e850a9ae103f Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 5 Jul 2017 17:23:41 +0800 Subject: [PATCH 59/79] Correct GLOG CHECK in Paddle Use CHECK instead of PCHECK, because PCHECK is used for errno. --- paddle/pserver/LightNetwork.cpp | 28 ++++++++++++++-------------- paddle/pserver/SocketChannel.cpp | 22 +++++++++++----------- paddle/pserver/test/SocketTest.cpp | 28 ++++++++++++++-------------- paddle/trainer/Tester.cpp | 2 +- paddle/utils/ThreadLocal.h | 12 ++++++------ 5 files changed, 46 insertions(+), 46 deletions(-) diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp index 922f25734d..8616fd2d5a 100644 --- a/paddle/pserver/LightNetwork.cpp +++ b/paddle/pserver/LightNetwork.cpp @@ -142,7 +142,7 @@ SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu) } /// trigger to initialize RDMA lib - PCHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n"; + CHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n"; } SocketServer::~SocketServer() { @@ -168,7 +168,7 @@ void SocketServer::tcpServer() { /// First call to socket() function socket_ = socket(AF_INET, SOCK_STREAM, 0); - PCHECK(socket_ >= 0) << "ERROR opening socket"; + CHECK(socket_ >= 0) << "ERROR opening socket"; /// Initialize socket structure bzero((char *)&serv_addr, sizeof(serv_addr)); @@ -176,7 +176,7 @@ void SocketServer::tcpServer() { serv_addr.sin_port = htons(port_); if (!addr_.empty()) { server = gethostbyname(addr_.c_str()); - PCHECK(server) << "ERROR, no such host: " << addr_; + CHECK(server) << "ERROR, no such host: " << addr_; bcopy((char *)server->h_addr, (char *)&serv_addr.sin_addr.s_addr, server->h_length); @@ -187,7 +187,7 @@ void SocketServer::tcpServer() { setOption(socket_); /// Now bind the host address using bind() call. 
- PCHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0) + CHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0) << "ERROR on binding " << addr_; /// Now start listening for the clients, here process will @@ -201,7 +201,7 @@ void SocketServer::tcpServer() { if (stopping_) { break; } - PCHECK(newsockfd >= 0) << "ERROR on accept"; + CHECK(newsockfd >= 0) << "ERROR on accept"; constexpr int kPeerNameLen = 128; char peerName[kPeerNameLen]; CHECK(inet_ntop(AF_INET, &cli_addr.sin_addr, peerName, kPeerNameLen)); @@ -227,14 +227,14 @@ void SocketServer::rdmaServer() { /// First call to socket() function rdmaSocket_ = rdma::ssocket(rdmaCpu_); - PCHECK(rdmaSocket_) << "ERROR opening RDMA socket"; + CHECK(rdmaSocket_) << "ERROR opening RDMA socket"; - PCHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0) + CHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0) << "ERROR bind RDMA socket"; /// Now start listening for the clients, here process will /// go in sleep mode and will wait for the incoming connection - PCHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket"; + CHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket"; while (true) { /// Accept actual connection from the client @@ -242,7 +242,7 @@ void SocketServer::rdmaServer() { if (stopping_) { break; } - PCHECK(newsock) << "ERROR on accept"; + CHECK(newsock) << "ERROR on accept"; constexpr int kPeerNameLen = 128; char peerName[kPeerNameLen]; @@ -290,7 +290,7 @@ RdmaClientDaemons::RdmaClientDaemons() { onlineCpus_ = rdma::numCpus(); for (auto i = 0; i < onlineCpus_; i++) { socket = rdma::csocket(i); - PCHECK(socket) << "ERROR open client socket daemon"; + CHECK(socket) << "ERROR open client socket daemon"; rdmaClientSocket_.push_back(socket); } @@ -355,7 +355,7 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) { /// Create a socket point int sockfd = socket(AF_INET, SOCK_STREAM, 0); - PCHECK(sockfd >= 0) << "ERROR opening socket"; + CHECK(sockfd >= 0) << "ERROR opening socket"; #if defined(__OSX__) || defined(__APPLE__) server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet); @@ -396,8 +396,8 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) { } std::this_thread::sleep_for(std::chrono::seconds(1)); } else { - PCHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":" - << serverPort << "errorno: " << errno; + CHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":" + << serverPort << "errorno: " << errno; } } while (errno == ECONNREFUSED); @@ -426,7 +426,7 @@ void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) { /// connect to server with socket daemon sock = rdma::connect(socketDaemon_, rdmaUri.c_str()); - PCHECK(sock) << "ERROR connect to server" << rdmaUri; + CHECK(sock) << "ERROR connect to server" << rdmaUri; std::vector seg; str::split(rdmaUri, '/', &seg); diff --git a/paddle/pserver/SocketChannel.cpp b/paddle/pserver/SocketChannel.cpp index 0599889164..12e3bc6552 100644 --- a/paddle/pserver/SocketChannel.cpp +++ b/paddle/pserver/SocketChannel.cpp @@ -51,7 +51,7 @@ size_t SocketChannel::read(void* buf, size_t size) { else len = rdma::read(rdmaSocket_, (char*)buf + total, size - total); - PCHECK(len >= 0) << " peer=" << peerName_; + CHECK(len >= 0) << " peer=" << peerName_; if (len <= 0) { return total; } @@ -69,7 +69,7 @@ size_t SocketChannel::write(const void* buf, size_t size) { else len = rdma::write(rdmaSocket_, (char*)buf + total, size - total); - 
PCHECK(len >= 0) << " peer=" << peerName_; + CHECK(len >= 0) << " peer=" << peerName_; if (len <= 0) { return total; } @@ -98,10 +98,10 @@ static size_t readwritev(IOFunc iofunc, while (size < total) { ssize_t len = iofunc(socket, &iovs[curIov], std::min(iovcnt - curIov, maxiovs)); - PCHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov - << " iovCnt=" << iovcnt - << " iovs[curIov].base=" << iovs[curIov].iov_base - << " iovs[curIov].iov_len=" << iovs[curIov].iov_len; + CHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov + << " iovCnt=" << iovcnt + << " iovs[curIov].base=" << iovs[curIov].iov_base + << " iovs[curIov].iov_len=" << iovs[curIov].iov_len; size += len; /// restore iovs[curIov] to the original value @@ -183,7 +183,7 @@ void SocketChannel::writeMessage(const std::vector& userIovs) { header.totalLength += iov.iov_len; } - PCHECK(writev(iovs) == (size_t)header.totalLength); + CHECK(writev(iovs) == (size_t)header.totalLength); } std::unique_ptr SocketChannel::readMessage() { @@ -194,7 +194,7 @@ std::unique_ptr SocketChannel::readMessage() { return nullptr; } - PCHECK(len == sizeof(header)); + CHECK(len == sizeof(header)); std::unique_ptr msgReader(new MsgReader(this, header.numIovs)); @@ -209,7 +209,7 @@ std::unique_ptr SocketChannel::readMessage() { MsgReader::MsgReader(SocketChannel* channel, size_t numBlocks) : channel_(channel), blockLengths_(numBlocks), currentBlockIndex_(0) { size_t size = numBlocks * sizeof(blockLengths_[0]); - PCHECK(channel_->read(&blockLengths_[0], size) == size); + CHECK(channel_->read(&blockLengths_[0], size) == size); } void MsgReader::readBlocks(const std::vector& bufs) { @@ -223,12 +223,12 @@ void MsgReader::readBlocks(const std::vector& bufs) { ++currentBlockIndex_; } - PCHECK(channel_->readv(&iovs) == totalLength); + CHECK(channel_->readv(&iovs) == totalLength); } void MsgReader::readNextBlock(void* buf) { CHECK_LT(currentBlockIndex_, blockLengths_.size()); - PCHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength()); + CHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength()); ++currentBlockIndex_; } diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp index 066a6c0293..6f6c9e596c 100644 --- a/paddle/pserver/test/SocketTest.cpp +++ b/paddle/pserver/test/SocketTest.cpp @@ -113,7 +113,7 @@ void SocketServer::run() { /* First call to socket() function */ socket_ = socket(AF_INET, SOCK_STREAM, 0); - PCHECK(socket_ >= 0) << "ERROR opening socket"; + CHECK(socket_ >= 0) << "ERROR opening socket"; /* Initialize socket structure */ bzero((char*)&serv_addr, sizeof(serv_addr)); @@ -122,7 +122,7 @@ void SocketServer::run() { serv_addr.sin_port = htons(port_); /* Now bind the host address using bind() call.*/ - PCHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) + CHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) << "ERROR on binding"; /* Now start listening for the clients, here process will @@ -134,7 +134,7 @@ void SocketServer::run() { while (true) { /* Accept actual connection from the client */ newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen); - PCHECK(newsockfd >= 0) << "ERROR on accept"; + CHECK(newsockfd >= 0) << "ERROR on accept"; SocketWorker* worker = new SocketWorker(newsockfd); worker->start(); @@ -146,17 +146,17 @@ void SocketWorker::run() { while (true) { int64_t n = channel_.readAll(&header, sizeof(header)); - PCHECK(n == sizeof(header)) << "ERROR reading from socket"; + CHECK(n == sizeof(header)) 
<< "ERROR reading from socket"; buffer_.resize(header.dataLength); n = channel_.readAll(&buffer_[0], header.dataLength); - PCHECK(n == header.dataLength) << "ERROR reading from socket"; + CHECK(n == header.dataLength) << "ERROR reading from socket"; /* Write a response to the client */ n = channel_.writeAll(&header, sizeof(header)); - PCHECK(n == sizeof(header)) << "ERROR reading from socket"; + CHECK(n == sizeof(header)) << "ERROR reading from socket"; n = channel_.writeAll(buffer_.data(), buffer_.size()); - PCHECK(n == header.dataLength) << "ERROR writing to socket"; + CHECK(n == header.dataLength) << "ERROR writing to socket"; } } @@ -177,9 +177,9 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) { /* Create a socket point */ int sockfd = socket(AF_INET, SOCK_STREAM, 0); - PCHECK(sockfd >= 0) << "ERROR opening socket"; + CHECK(sockfd >= 0) << "ERROR opening socket"; server = gethostbyname(serverAddr.c_str()); - PCHECK(server) << "ERROR, no such host: " << serverAddr; + CHECK(server) << "ERROR, no such host: " << serverAddr; bzero((char*)&serv_addr, sizeof(serv_addr)); serv_addr.sin_family = AF_INET; @@ -189,7 +189,7 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) { serv_addr.sin_port = htons(serverPort); /* Now connect to the server */ - PCHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) + CHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) << "ERROR connecting"; channel_.reset(new SocketChannel(sockfd)); @@ -234,18 +234,18 @@ int main(int argc, char** argv) { cpuGrad.copyFrom(gpuGrad); header.dataLength = dataSize; - PCHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header)) + CHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header)) << "Client write header error"; - PCHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize) + CHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize) << "Client write data error"; /* Now read server response */ - PCHECK(channel->readAll(&header, sizeof(header)) == sizeof(header)) + CHECK(channel->readAll(&header, sizeof(header)) == sizeof(header)) << "Client read header error"; CHECK_EQ((uint64_t)header.dataLength, dataSize); - PCHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize) + CHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize) << "Client read data error"; gpuParam.copyFrom(cpuParam); diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp index 80664fa877..16e676d602 100644 --- a/paddle/trainer/Tester.cpp +++ b/paddle/trainer/Tester.cpp @@ -175,7 +175,7 @@ real Tester::forwardOneBatch(const DataBatch& dataBatch, } hl_stream_synchronize(HPPL_STREAM_DEFAULT); FILE* fp = fopen(featFile.c_str(), "ab+"); - PCHECK(!ferror(fp)) << "Fail to open " << featFile; + CHECK(!ferror(fp)) << "Fail to open " << featFile; size_t sampleNum = featMatrices[0]->getHeight(); for (size_t i = 0; i < sampleNum; ++i) { diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h index a4987c9ec2..b5e2862546 100644 --- a/paddle/utils/ThreadLocal.h +++ b/paddle/utils/ThreadLocal.h @@ -51,7 +51,7 @@ template class ThreadLocal { public: ThreadLocal() { - PCHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0); + CHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0); } ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); } @@ -65,7 +65,7 @@ public: if (!p && createLocal) { p = new T(); int ret = pthread_setspecific(threadSpecificKey_, p); - PCHECK(ret == 0); + CHECK(ret == 
0); } return p; } @@ -79,7 +79,7 @@ public: if (T* q = get(false)) { dataDestructor(q); } - PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); } /** @@ -112,7 +112,7 @@ private: template class ThreadLocalD { public: - ThreadLocalD() { PCHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); } + ThreadLocalD() { CHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); } ~ThreadLocalD() { pthread_key_delete(threadSpecificKey_); for (auto t : threadMap_) { @@ -127,7 +127,7 @@ public: T* p = (T*)pthread_getspecific(threadSpecificKey_); if (!p) { p = new T(); - PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); updateMap(p); } return p; @@ -141,7 +141,7 @@ public: if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) { dataDestructor(q); } - PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); updateMap(p); } From 2e302085d7c0a79a8516533b29450a1febc25d79 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 5 Jul 2017 09:49:30 +0000 Subject: [PATCH 60/79] fix bugs --- cmake/generic.cmake | 4 ++-- go/pserver/client/c/CMakeLists.txt | 3 ++- go/pserver/client/c/test/CMakeLists.txt | 4 +--- paddle/api/CMakeLists.txt | 1 + 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 74396abdbb..d51b95a5d7 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -192,9 +192,9 @@ function(cc_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main -lstdc++ -lm) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) - add_test(${TARGET_NAME} ${TARGET_NAME}) + add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endfunction(cc_test) diff --git a/go/pserver/client/c/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt index a3fcaeef19..d2ac20e25c 100644 --- a/go/pserver/client/c/CMakeLists.txt +++ b/go/pserver/client/c/CMakeLists.txt @@ -1,5 +1,6 @@ cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf) go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer) if(WITH_TESTING) - add_subdirectory(test) + # TODO: add unit test + #add_subdirectory(test) endif() diff --git a/go/pserver/client/c/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt index 44bc183738..dce8645ce7 100644 --- a/go/pserver/client/c/test/CMakeLists.txt +++ b/go/pserver/client/c/test/CMakeLists.txt @@ -1,4 +1,2 @@ -# FIXME:It's ugly -#cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer) -cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_optimizer paddle_proto glog gflags protobuf) +cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer) add_style_check_target(test_cclient test_cclient.c) diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index 39d8aa075b..84da89a142 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -66,6 +66,7 @@ SWIG_LINK_LIBRARIES(swig_paddle paddle_trainer_lib paddle_network paddle_parameter + paddle_optimizer paddle_math paddle_utils paddle_proto From 
204869c2dae9b03b1155be106484ef328e942132 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 5 Jul 2017 10:10:18 +0000 Subject: [PATCH 61/79] fix bugs --- paddle/scripts/docker/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 1ccee686df..ab60f1a38d 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -78,7 +78,7 @@ paddle version # PaddlePaddle. This awkwardness is due to # https://github.com/PaddlePaddle/Paddle/issues/1854. It also # describes a solution. -if [ ${WITH_DOC} == "ON" ]; then +if [[ ${WITH_DOC} == "ON" ]]; then cat < Date: Wed, 5 Jul 2017 18:18:32 +0800 Subject: [PATCH 62/79] fix auto cgo LDFLAGS --- go/pserver/optimizer.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index d84f55b987..54d1082094 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -2,7 +2,7 @@ package pserver // #cgo CFLAGS: -I ../../ // //FIXME: ldflags contain "build" path -// #cgo LDFLAGS: ../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm +// #cgo LDFLAGS: ${SRCDIR}/../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm // #include "paddle/optimizer/optimizer.h" // #include // #include @@ -56,8 +56,8 @@ func newOptimizer(paramWithConfigs ParameterWithConfig) *optimizer { func (o *optimizer) GetWeights() []byte { var buffer unsafe.Pointer - buffer_len := C.paddle_optimizer_get_weights(o.opt, &buffer) - return cArrayToSlice(buffer, int(buffer_len)*C.sizeof_float) + bufferLen := C.paddle_optimizer_get_weights(o.opt, &buffer) + return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float) } func (o *optimizer) UpdateParameter(g Gradient) error { From c37da0bd3ba14318198bfc6dd8f8ba5e13c1a269 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 5 Jul 2017 18:36:47 +0800 Subject: [PATCH 63/79] Remove hasParamInitialValue flag. 
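The initial mean/std pair in TestConfig is now always applied (the defaults
of 0.0 and 1.0 reproduce the old behavior), with a CHECK_GE guarding against
a negative std. The cross-channel-norm test keeps mean = 1 and std = 0,
which pins the scale parameter to exactly one. A rough sketch of why, in Go
for illustration only (not Paddle code): a normal draw with zero standard
deviation degenerates to the constant mean.

    package main

    import (
        "fmt"
        "math/rand"
    )

    // initParam mimics drawing an n-element parameter from N(mean, std^2).
    func initParam(n int, mean, std float64) []float64 {
        p := make([]float64, n)
        for i := range p {
            p[i] = mean + std*rand.NormFloat64() // std == 0 -> p[i] == mean
        }
        return p
    }

    func main() {
        fmt.Println(initParam(4, 1.0, 0.0)) // [1 1 1 1]
    }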
--- paddle/gserver/tests/LayerGradUtil.cpp | 9 +++------ paddle/gserver/tests/LayerGradUtil.h | 2 -- paddle/gserver/tests/test_LayerGrad.cpp | 1 - 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index 66aafba844..15b8cedeb8 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -498,12 +498,9 @@ void initTestLayer(TestConfig testConf, paraConfig.add_dims((*layerMap)[input.input_layer_name()]->getSize()); paraConfig.add_dims(testConf.layerConfig.size()); } - if (testConf.hasParamInitialValue) { - paraConfig.set_initial_mean(testConf.paramInitialMean); - paraConfig.set_initial_std(testConf.paramInitialStd); - } else { - paraConfig.set_initial_std(1); - } + CHECK_GE(testConf.paramInitialStd, 0); + paraConfig.set_initial_mean(testConf.paramInitialMean); + paraConfig.set_initial_std(testConf.paramInitialStd); initParameter(paraName, paraSize, inputDef.isStatic, false, paraConfig); } } diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index 9dbd202757..d299b4dd09 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -127,7 +127,6 @@ struct TestConfig { size_t biasSize; real paramInitialMean; real paramInitialStd; - bool hasParamInitialValue; bool testAccumulate; bool testState; bool staticBias; @@ -136,7 +135,6 @@ struct TestConfig { : biasSize(0), paramInitialMean(0.0), paramInitialStd(1.0), - hasParamInitialValue(false), testAccumulate(true), testState(false), staticBias(false), diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 6441e08b48..bf0136a10f 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1661,7 +1661,6 @@ TEST(Layer, PadLayer) { TEST(Layer, CrossChannelNormLayer) { TestConfig config; - config.hasParamInitialValue = true; config.paramInitialMean = 1.; config.paramInitialStd = 0.; config.layerConfig.set_type("norm"); From b68e90be820f7a925e114f76f27156e728fc9e79 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Wed, 5 Jul 2017 21:30:28 +0800 Subject: [PATCH 64/79] fix go test building --- go/pserver/client/c/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/go/pserver/client/c/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt index a3fcaeef19..34aa7ca5ff 100644 --- a/go/pserver/client/c/CMakeLists.txt +++ b/go/pserver/client/c/CMakeLists.txt @@ -1,4 +1,5 @@ cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf) +target_link_libraries(paddle_go_optimizer stdc++ m) go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer) if(WITH_TESTING) add_subdirectory(test) From 78f1274d6e2c75d0036ae2a7da6cbccfc844b8f0 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Wed, 5 Jul 2017 21:40:12 +0800 Subject: [PATCH 65/79] remove unnessesary cc_test link --- cmake/generic.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index d51b95a5d7..c2962e35ef 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -192,7 +192,7 @@ function(cc_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main -lstdc++ -lm) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest 
gtest_main) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() @@ -285,7 +285,7 @@ function(go_library TARGET_NAME) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${${TARGET_NAME}_LIB_PATH}" # Golang build source code - COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH} GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} + COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} -o "${${TARGET_NAME}_LIB_PATH}" "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}" # must run under GOPATH From 4d2a83c750c6168d16a4ee302b0c69e553bd0b34 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Wed, 5 Jul 2017 21:58:46 +0800 Subject: [PATCH 66/79] update again --- go/pserver/client/c/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/pserver/client/c/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt index f287f85071..dce8645ce7 100644 --- a/go/pserver/client/c/test/CMakeLists.txt +++ b/go/pserver/client/c/test/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient) +cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer) add_style_check_target(test_cclient test_cclient.c) From 7848a3fb5c6de5c21a6c1c34a9d12e8e866c760c Mon Sep 17 00:00:00 2001 From: wuyi05 Date: Thu, 6 Jul 2017 09:45:01 +0800 Subject: [PATCH 67/79] remove cclient test --- go/pserver/client/c/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/go/pserver/client/c/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt index a3fcaeef19..d5c1ed38e5 100644 --- a/go/pserver/client/c/CMakeLists.txt +++ b/go/pserver/client/c/CMakeLists.txt @@ -1,5 +1,7 @@ cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf) go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer) if(WITH_TESTING) - add_subdirectory(test) + # FIXME: this test requires pserver which is not managed by the test + # we need some kind of e2e testing machanism. + # add_subdirectory(test) endif() From d6ecae779a28d51e669a4c029d00ec57a98f2bc8 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 11:25:28 +0800 Subject: [PATCH 68/79] FIX: propagation dependencies and out of date rebuild --- cmake/generic.cmake | 51 ++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index cae9524b2f..87d8caaec4 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -99,15 +99,37 @@ function(merge_static_libs TARGET_NAME) set(libs ${ARGN}) list(REMOVE_DUPLICATES libs) - # First get the file names of the libraries to be merged + # Get all propagation dependencies from the merged libraries foreach(lib ${libs}) + list(APPEND libs_deps ${${lib}_LIB_DEPENDS}) + endforeach() + + # To produce a library we need at least one source file. + # It is created by add_custom_command below and will helps + # also help to track dependencies. + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). 
+ add_custom_command(OUTPUT ${dummyfile} + COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile} + DEPENDS ${libs}) + + # Generate dummy staic lib + file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") + add_library(${TARGET_NAME} STATIC ${dummyfile}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + foreach(lib ${libs}) + # Get the file names of the libraries to be merged set(libfiles ${libfiles} $) endforeach() + # Get the file name of the generated library + set(outlibfile "$") + if(APPLE) # Use OSX's libtool to merge archives - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) - file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") - add_library(${TARGET_NAME} STATIC ${dummyfile}) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}) @@ -117,7 +139,8 @@ function(merge_static_libs TARGET_NAME) set(objdir ${lib}.objdir) add_custom_command(OUTPUT ${objdir} - COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}) + COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir} + DEPENDS ${lib}) add_custom_command(OUTPUT ${objlistfile} COMMAND ${CMAKE_AR} -x "$" @@ -125,23 +148,9 @@ function(merge_static_libs TARGET_NAME) DEPENDS ${lib} ${objdir} WORKING_DIRECTORY ${objdir}) - # Empty dummy source file that goes into merged library - set(mergebase ${lib}.mergebase.c) - add_custom_command(OUTPUT ${mergebase} - COMMAND ${CMAKE_COMMAND} -E touch ${mergebase} - DEPENDS ${objlistfile}) - - list(APPEND mergebases "${mergebase}") - endforeach() - - # We need a target for the output merged library - add_library(${TARGET_NAME} STATIC ${mergebases}) - set(outlibfile "$") - - foreach(lib ${libs}) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND ${CMAKE_AR} ru ${outlibfile} @"../${lib}.objlist" - WORKING_DIRECTORY ${lib}.objdir) + COMMAND ${CMAKE_AR} ru ${outlibfile} *.o + WORKING_DIRECTORY ${objdir}) endforeach() add_custom_command(TARGET ${TARGET_NAME} POST_BUILD From 3e4ba647eec7bc16511e1146d5a696cd124c6a27 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 11:28:52 +0800 Subject: [PATCH 69/79] FIX: remove duplicate --- cmake/generic.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 87d8caaec4..1a4600ef4b 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -103,6 +103,7 @@ function(merge_static_libs TARGET_NAME) foreach(lib ${libs}) list(APPEND libs_deps ${${lib}_LIB_DEPENDS}) endforeach() + list(REMOVE_DUPLICATES libs_deps) # To produce a library we need at least one source file. # It is created by add_custom_command below and will helps From 1b366dc2fff2b896fc92c1aa161183e6c88f6b7e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 6 Jul 2017 14:44:40 +0800 Subject: [PATCH 70/79] Fix CI error on test_LayerGrad.LSTM * We should not EXPECT_EQ between a float value and a int value. Use ASSERT_NEAR instead. 
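The pitfall is generic to floating point, not specific to this test: an
accumulated sum picks up rounding error, so an exact-equality assertion can
fail even when the result is "obviously" right. A standalone illustration
(in Go for brevity; gtest's ASSERT_NEAR performs the same
|a - b| <= tolerance check):

    package main

    import (
        "fmt"
        "math"
    )

    func main() {
        var sum float64
        for i := 0; i < 10; i++ {
            sum += 0.1
        }
        fmt.Println(sum == 1.0)               // false: rounding error accumulates
        fmt.Println(math.Abs(sum-1.0) < 1e-5) // true: tolerance-based comparison
    }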
---
 paddle/gserver/tests/LayerGradUtil.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index 15b8cedeb8..9eca58f1a1 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -241,7 +241,7 @@ void testBatchState(LayerPtr testLayer,
   std::vector<Argument> args;
   args.push_back(out);
-  EXPECT_EQ(0, Argument::sum(args)) << "testBatchState failed";
+  ASSERT_NEAR(0, Argument::sum(args), 1e-5) << "testBatchState failed";
   for (size_t seqId = 0; seqId < numSequences; ++seqId) {
     start[seqId] += seqLens[seqId];
   }

From e2ea1f42e9202e5591e2de1ce5f96c573dcc6484 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Thu, 6 Jul 2017 14:12:45 +0800
Subject: [PATCH 71/79] Generate python protobufs for paddle.v2.framework

Python should be able to manipulate Protobuf messages because:

1. Python's `create_op_creation_methods` takes the `OpProto` array to
   generate all `op_creation_methods` at runtime.
2. Each `op_creation_method` will create an `OpDesc`, pass it to the
   Paddle C++ method `CreateOp`, and return the Op handle.

Here is the list of what is added in this commit:

* Add `protobuf_generate_python` if it is not defined.
  * Before cmake 3.4, `protobuf_generate_python` is not defined, so we
    copy the implementation of that function into `protobuf.cmake`.
* Add a `py_proto_compile` function in `cmake/generic.cmake`.
  * It follows bazel's API interface.
  * https://github.com/pubref/rules_protobuf#rules
* Add an empty package named `paddle.v2.framework`; all Python code of
  `paddle::framework` will live in that package.
* Generate the `__init__.py` of the protobuf Python module by `touch`
  while compiling.
* Change setup.py.in so that `paddle.v2.framework.proto` uses the
  generated protobuf Python modules.
---
 cmake/external/protobuf.cmake                 | 59 +++++++++++++++++++
 cmake/generic.cmake                           |  9 +++
 paddle/framework/CMakeLists.txt               |  5 +-
 python/CMakeLists.txt                         |  3 +-
 python/paddle/v2/framework/__init__.py        |  1 +
 .../paddle/v2/framework/tests/CMakeLists.txt  |  1 +
 .../v2/framework/tests/test_protobuf.py       | 26 ++++++++
 python/setup.py.in                            |  9 ++-
 8 files changed, 109 insertions(+), 4 deletions(-)
 create mode 100644 python/paddle/v2/framework/__init__.py
 create mode 100644 python/paddle/v2/framework/tests/CMakeLists.txt
 create mode 100644 python/paddle/v2/framework/tests/test_protobuf.py

diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 3c74944bc2..e629d61585 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -17,6 +17,65 @@ INCLUDE(ExternalProject)
 FIND_PACKAGE(Protobuf QUIET)
 SET(PROTOBUF_FOUND "OFF")
+if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined.
+ function(protobuf_generate_python SRCS) + # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") + return() + endif() + + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() + + if(DEFINED Protobuf_IMPORT_DIRS) + foreach(DIR ${Protobuf_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + endif() + + set(${SRCS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) + get_filename_component(FIL_DIR ${FIL} DIRECTORY) + if(FIL_DIR) + set(FIL_WE "${FIL_DIR}/${FIL_WE}") + endif() + endif() + + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" + COMMAND ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE} + COMMENT "Running Python protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + endfunction() +endif() # Print and set the protobuf library information, # finish this cmake process and exit from this file. diff --git a/cmake/generic.cmake b/cmake/generic.cmake index d51b95a5d7..a92671ae62 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -335,3 +335,12 @@ function(proto_library TARGET_NAME) protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf) endfunction() + +function(py_proto_compile TARGET_NAME) + set(oneValueArgs "") + set(multiValueArgs SRCS) + cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(py_srcs) + protobuf_generate_python(py_srcs ${py_proto_compile_SRCS}) + add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs}) +endfunction() \ No newline at end of file diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index dcd70d2851..970b2b9abd 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -9,6 +9,9 @@ cc_test(enforce_test SRCS enforce_test.cc) proto_library(attr_type SRCS attr_type.proto) proto_library(op_proto SRCS op_proto.proto DEPS attr_type) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) - proto_library(op_desc SRCS op_desc.proto DEPS attr_type) cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) +py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) +# Generate an empty __init__.py to make framework_py_proto as a valid python module. 
+add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) +add_dependencies(framework_py_proto framework_py_proto_init) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 361e764e25..13a1802ee3 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -29,7 +29,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp - DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) + DEPENDS gen_proto_py framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) add_custom_target(paddle_python ALL DEPENDS ${OUTPUT_DIR}/.timestamp) @@ -43,6 +43,7 @@ if (WITH_TESTING) add_subdirectory(paddle/v2/tests) add_subdirectory(paddle/v2/reader/tests) add_subdirectory(paddle/v2/plot/tests) + add_subdirectory(paddle/v2/framework/tests) endif() endif() install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} diff --git a/python/paddle/v2/framework/__init__.py b/python/paddle/v2/framework/__init__.py new file mode 100644 index 0000000000..c942373c66 --- /dev/null +++ b/python/paddle/v2/framework/__init__.py @@ -0,0 +1 @@ +__all__ = ['proto'] diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt new file mode 100644 index 0000000000..8cb0c5c376 --- /dev/null +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -0,0 +1 @@ +add_python_test(test_framework test_protobuf.py) diff --git a/python/paddle/v2/framework/tests/test_protobuf.py b/python/paddle/v2/framework/tests/test_protobuf.py new file mode 100644 index 0000000000..f0e6019199 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_protobuf.py @@ -0,0 +1,26 @@ +import paddle.v2.framework.proto.op_proto_pb2 +import paddle.v2.framework.proto.attr_type_pb2 +import unittest + + +class TestFrameworkProto(unittest.TestCase): + def test_all(self): + op_proto_lib = paddle.v2.framework.proto.op_proto_pb2 + attr_type_lib = paddle.v2.framework.proto.attr_type_pb2 + op_proto = op_proto_lib.OpProto() + ipt0 = op_proto.inputs.add() + ipt0.name = "a" + ipt0.comment = "the input of cosine op" + ipt1 = op_proto.inputs.add() + ipt1.name = "b" + ipt1.comment = "the other input of cosine op" + opt = op_proto.outputs.add() + opt.name = "output" + opt.comment = "the output of cosine op" + op_proto.comment = "cosine op, output = scale*cos(a, b)" + attr = op_proto.attrs.add() + attr.name = "scale" + attr.comment = "scale of cosine op" + attr.type = attr_type_lib.FLOAT + op_proto.type = "cos" + self.assertTrue(op_proto.IsInitialized()) diff --git a/python/setup.py.in b/python/setup.py.in index dae0166487..78423614a6 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -9,7 +9,9 @@ packages=['paddle', 'paddle.v2.dataset', 'paddle.v2.reader', 'paddle.v2.master', - 'paddle.v2.plot'] + 'paddle.v2.plot', + 'paddle.v2.framework', + 'paddle.v2.framework.proto'] setup_requires=["requests", "numpy", @@ -29,6 +31,9 @@ setup(name='paddle', packages=packages, package_data={'paddle.v2.master': ['${paddle_master_LIB_NAME}'], }, package_dir={ - '': '${CMAKE_CURRENT_SOURCE_DIR}' + '': '${CMAKE_CURRENT_SOURCE_DIR}', + # The paddle.v2.framework.proto will be generated while compiling. + # So that package points to other directory. 
+          'paddle.v2.framework.proto': '${CMAKE_BINARY_DIR}/paddle/framework'
       },
 )

From 847535f4fe6cea0b954a67fffea4c7b9ed96bd77 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 6 Jul 2017 15:42:29 +0800
Subject: [PATCH 72/79] FIX: propagate dependencies under Linux

---
 cmake/generic.cmake | 69 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 30 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 1a4600ef4b..3900ea2604 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -103,38 +103,33 @@ function(merge_static_libs TARGET_NAME)
   foreach(lib ${libs})
     list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
   endforeach()
-  list(REMOVE_DUPLICATES libs_deps)

-  # To produce a library we need at least one source file.
-  # It is created by add_custom_command below and will helps
-  # also help to track dependencies.
-  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
-
-  # Make the generated dummy source file depended on all static input
-  # libs. If input lib changes,the source file is touched
-  # which causes the desired effect (relink).
-  add_custom_command(OUTPUT ${dummyfile}
-    COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
-    DEPENDS ${libs})
-
-  # Generate dummy staic lib
-  file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-  add_library(${TARGET_NAME} STATIC ${dummyfile})
-  target_link_libraries(${TARGET_NAME} ${libs_deps})
+  if(APPLE) # Use OSX's libtool to merge archives
+    # To produce a library we need at least one source file.
+    # It is created by add_custom_command below and will
+    # also help to track dependencies.
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)

-  foreach(lib ${libs})
-    # Get the file names of the libraries to be merged
-    set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
-  endforeach()
+    # Make the generated dummy source file depend on all static input
+    # libs. If an input lib changes, the source file is touched,
+    # which causes the desired effect (relink).
+    add_custom_command(OUTPUT ${dummyfile}
+      COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
+      DEPENDS ${libs})

-  # Get the file name of the generated library
-  set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+    # Generate dummy static lib
+    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+    add_library(${TARGET_NAME} STATIC ${dummyfile})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})

-  if(APPLE) # Use OSX's libtool to merge archives
+    foreach(lib ${libs})
+      # Get the file names of the libraries to be merged
+      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+    endforeach()
     add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
       COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
       COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
-  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
     foreach(lib ${libs})
       set(objlistfile ${lib}.objlist) # list of objects in the input library
       set(objdir ${lib}.objdir)
@@ -149,13 +144,27 @@ function(merge_static_libs TARGET_NAME)
       DEPENDS ${lib} ${objdir}
       WORKING_DIRECTORY ${objdir})

-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND ${CMAKE_AR} ru ${outlibfile} *.o
-      WORKING_DIRECTORY ${objdir})
+    # Empty dummy source file that goes into merged library
+    set(mergebase ${lib}.mergebase.c)
+    add_custom_command(OUTPUT ${mergebase}
+      COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
+      DEPENDS ${objlistfile})
+
+    list(APPEND mergebases "${mergebase}")
   endforeach()

-  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-    COMMAND ${CMAKE_RANLIB} ${outlibfile})
+  add_library(${TARGET_NAME} STATIC ${mergebases})
+  target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+  # Get the file name of the generated library
+  set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+
+  foreach(lib ${libs})
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+      COMMAND ${CMAKE_AR} cr ${outlibfile} *.o
+      COMMAND ${CMAKE_RANLIB} ${outlibfile}
+      WORKING_DIRECTORY ${lib}.objdir)
+  endforeach()
 endif()
 endfunction(merge_static_libs)

From 203364281ed8b86c53c520142b881f00aca5485e Mon Sep 17 00:00:00 2001
From: caoying03
Date: Thu, 6 Jul 2017 16:44:54 +0800
Subject: [PATCH 73/79] enable error clipping in FC layer.

---
 python/paddle/trainer/config_parser.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 370529ed97..e020be9378 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from __future__ import print_function
+import pdb
 '''
 The following functions are available in the config file:
@@ -761,8 +762,8 @@ class DotMulOperator(Operator):

     def check_dims(self):
         for i in range(2):
-            config_assert(self.operator_conf.input_sizes[i] ==
-                          self.operator_conf.output_size,
+            config_assert(self.operator_conf.input_sizes[
+                i] == self.operator_conf.output_size,
                           "DotMul input_size != output_size")

     def calc_output_size(self, input_sizes):
@@ -1193,8 +1194,7 @@ def parse_image(image, input_layer_name, image_conf):
 def parse_norm(norm, input_layer_name, norm_conf):
     norm_conf.norm_type = norm.norm_type
     config_assert(
-        norm.norm_type in
-        ['rnorm', 'cmrnorm-projection', 'cross-channel-norm'],
+        norm.norm_type in ['rnorm', 'cmrnorm-projection', 'cross-channel-norm'],
         "norm-type %s is not in [rnorm, cmrnorm-projection, cross-channel-norm]"
         % norm.norm_type)
     norm_conf.channels = norm.channels
@@ -1571,7 +1571,13 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):

 @config_layer('fc')
 class FCLayer(LayerBase):
-    def __init__(self, name, size, inputs, bias=True, **xargs):
+    def __init__(self,
+                 name,
+                 size,
+                 inputs,
+                 bias=True,
+                 error_clipping_threshold=None,
+                 **xargs):
         super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
@@ -1589,6 +1595,9 @@ class FCLayer(LayerBase):
                 format)
         self.create_bias_parameter(bias, self.config.size)

+        if error_clipping_threshold is not None:
+            self.config.error_clipping_threshold = error_clipping_threshold
+

 @config_layer('selective_fc')
 class SelectiveFCLayer(LayerBase):
@@ -3425,7 +3434,8 @@ DEFAULT_SETTING = dict(

 settings = copy.deepcopy(DEFAULT_SETTING)

-settings_deprecated = dict(usage_ratio=1., )
+settings_deprecated = dict(
+    usage_ratio=1., )

 trainer_settings = dict(
     save_dir="./output/model",

From 075954c17ceaf422478961d9a5d6aaa364458415 Mon Sep 17 00:00:00 2001
From: caoying03
Date: Thu, 6 Jul 2017 17:40:58 +0800
Subject: [PATCH 74/79] follow comment.
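[Editor's note: the diff below moves error_clipping_threshold out of the FCLayer constructor (where the previous patch added it) and out of MixedLayer, and into LayerBase, so that every layer config accepts it through the common constructor arguments.]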
---
 python/paddle/trainer/config_parser.py | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 1fed6db33c..826ba2834a 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1353,7 +1353,8 @@ class LayerBase(object):
                  device=None,
                  active_type="",
                  drop_rate=0.,
-                 coeff=None):
+                 coeff=None,
+                 error_clipping_threshold=None):
         config_assert('@' not in name,
                       "layer name: %s contain special character @" % name)
         global g_current_submodel
@@ -1387,6 +1388,9 @@ class LayerBase(object):
         elif g_default_device is not None:
             self.config.device = g_default_device

+        if error_clipping_threshold is not None:
+            self.config.error_clipping_threshold = error_clipping_threshold
+
         for input_index in xrange(len(self.inputs)):
             input = self.inputs[input_index]
             input_config = None
@@ -1571,13 +1575,7 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):

 @config_layer('fc')
 class FCLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 size,
-                 inputs,
-                 bias=True,
-                 error_clipping_threshold=None,
-                 **xargs):
+    def __init__(self, name, size, inputs, bias=True, **xargs):
         super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
@@ -1589,9 +1593,6 @@ class FCLayer(LayerBase):
                 format)
         self.create_bias_parameter(bias, self.config.size)

-        if error_clipping_threshold is not None:
-            self.config.error_clipping_threshold = error_clipping_threshold
-

 @config_layer('selective_fc')
 class SelectiveFCLayer(LayerBase):
@@ -2791,13 +2786,7 @@ class TensorLayer(LayerBase):

 @config_layer('mixed')
 class MixedLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 size=0,
-                 bias=True,
-                 error_clipping_threshold=None,
-                 **xargs):
+    def __init__(self, name, inputs, size=0, bias=True, **xargs):
         config_assert(inputs, 'inputs cannot be empty')
         super(MixedLayer, self).__init__(
             name, 'mixed', size, inputs=inputs, **xargs)
@@ -2879,9 +2868,6 @@ class MixedLayer(LayerBase):
             self.config.bias_size = psize
             self.create_bias_parameter(bias, psize)

-        if error_clipping_threshold is not None:
-            self.config.error_clipping_threshold = error_clipping_threshold
-

 # like MixedLayer, but no bias parameter
 @config_func

From f2a82b16a25c2eb825ddb0a46b4966b01f248f22 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Thu, 6 Jul 2017 11:58:43 +0000
Subject: [PATCH 75/79] add print messages

---
 python/CMakeLists.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 361e764e25..7a57d922ef 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -17,15 +17,21 @@ add_custom_target(copy_paddle_master)
 SET(COPY_PADDLE_MASTER "")
 if(WITH_GOLANG)
     SET(COPY_PADDLE_MASTER "copy_paddle_master")
+    message("paddle_master_lib_path:" ${paddle_master_LIB_PATH})
+    message("PROJ_ROOT:" ${PROJ_ROOT})
     add_custom_command(TARGET ${COPY_PADDLE_MASTER}
         COMMAND cp ${paddle_master_LIB_PATH} ${PROJ_ROOT}/python/paddle/v2/master/
     )
     add_dependencies(copy_paddle_master paddle_master)
 endif(WITH_GOLANG)

+message("paddle_master_LIB_NAME:" ${paddle_master_LIB_NAME})
+message("CMAKE_CURRENT_BINARY_DIR:" ${CMAKE_CURRENT_BINARY_DIR})
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)

+message("OUTPUT_DIR:" ${OUTPUT_DIR})
+message("py_env:" ${py_env})
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp

From 660475b5ab1c6cc295420a527d549dc1f38ba03a Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Thu, 6 Jul 2017 12:14:30 +0000
Subject: [PATCH 76/79] modify to add paddle_master name

---
 python/CMakeLists.txt | 1 +
 python/setup.py.in    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 7a57d922ef..633d2b3786 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -27,6 +27,7 @@ endif(WITH_GOLANG)

 message("paddle_master_LIB_NAME:" ${paddle_master_LIB_NAME})
 message("CMAKE_CURRENT_BINARY_DIR:" ${CMAKE_CURRENT_BINARY_DIR})
+message("CMAKE_CURRENT_SOURCE_DIR:" ${CMAKE_CURRENT_SOURCE_DIR})
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)

diff --git a/python/setup.py.in b/python/setup.py.in
index dae0166487..9c77bed15f 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -27,7 +27,7 @@ setup(name='paddle',
       description='Parallel Distributed Deep Learning',
       install_requires=setup_requires,
       packages=packages,
-      package_data={'paddle.v2.master': ['${paddle_master_LIB_NAME}'], },
+      package_data={'paddle.v2.master': ['libpaddle_master.so'], },
       package_dir={
           '': '${CMAKE_CURRENT_SOURCE_DIR}'
       },

From b396055499c5bd34bea5753e7ca19e18e2f7044b Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Thu, 6 Jul 2017 13:34:40 +0000
Subject: [PATCH 77/79] add -V

---
 paddle/scripts/docker/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index ab60f1a38d..0579bfcc7a 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -60,7 +60,7 @@ EOF
 make -j `nproc`
 if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
     pip uninstall -y py-paddle paddle || true
-    ctest --output-on-failure
+    ctest -V --output-on-failure
 fi

From 4daa247d80a3f94b8f60fe084bd3887b4b5c698e Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Fri, 7 Jul 2017 01:12:48 +0000
Subject: [PATCH 78/79] rm -v

---
 paddle/scripts/docker/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 0579bfcc7a..ab60f1a38d 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -60,7 +60,7 @@ EOF
 make -j `nproc`
 if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
     pip uninstall -y py-paddle paddle || true
-    ctest -V --output-on-failure
+    ctest --output-on-failure
 fi

From 126e64fc830ba5b787a787fdd2e2b7f7e2ef1939 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Fri, 7 Jul 2017 01:35:16 +0000
Subject: [PATCH 79/79] add cmake

---
 python/CMakeLists.txt | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 633d2b3786..361e764e25 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -17,22 +17,15 @@ add_custom_target(copy_paddle_master)
 SET(COPY_PADDLE_MASTER "")
 if(WITH_GOLANG)
     SET(COPY_PADDLE_MASTER "copy_paddle_master")
-    message("paddle_master_lib_path:" ${paddle_master_LIB_PATH})
-    message("PROJ_ROOT:" ${PROJ_ROOT})
     add_custom_command(TARGET ${COPY_PADDLE_MASTER}
         COMMAND cp ${paddle_master_LIB_PATH} ${PROJ_ROOT}/python/paddle/v2/master/
     )
     add_dependencies(copy_paddle_master paddle_master)
 endif(WITH_GOLANG)

-message("paddle_master_LIB_NAME:" ${paddle_master_LIB_NAME})
-message("CMAKE_CURRENT_BINARY_DIR:" ${CMAKE_CURRENT_BINARY_DIR})
-message("CMAKE_CURRENT_SOURCE_DIR:" ${CMAKE_CURRENT_SOURCE_DIR})
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)

-message("OUTPUT_DIR:" ${OUTPUT_DIR})
-message("py_env:" ${py_env})
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
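[Editor's note: to close out patch 72 above, a hedged sketch of how merge_static_libs is typically invoked. The target and input names below are hypothetical, and the helper is assumed to take the libraries to merge as trailing arguments (its body iterates over a ${libs} list built from them); the inputs must be existing CMake static-library targets, since the helper reads $<TARGET_FILE:...> and ${lib}_LIB_DEPENDS for each of them:

    # Bundle three static-library targets into one archive, libpaddle_bundle.a.
    # On macOS the inputs are merged with `libtool -static`; on other UNIX
    # systems each input archive is unpacked with ar and its objects are
    # re-added to the merged archive, followed by ranlib.
    merge_static_libs(paddle_bundle paddle_utils paddle_math paddle_function)

The merged target then links transitively against the dependencies collected in libs_deps, which is the dependency propagation this patch fixes on Linux.]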