From 5413af8d11eb8662598e5f89d5f4fa284e1c6fdf Mon Sep 17 00:00:00 2001
From: xzl
Date: Fri, 2 Jun 2017 14:28:20 +0800
Subject: [PATCH 01/69] improve pruning module

---
 paddle/parameter/ParameterUpdaterHook.cpp     | 90 +++++++++++++++++--
 proto/ParameterConfig.proto                   |  2 +
 python/paddle/trainer/config_parser.py        | 15 +++-
 python/paddle/trainer_config_helpers/attrs.py | 46 +++++++++-
 python/paddle/v2/attr.py                      |  2 +
 5 files changed, 144 insertions(+), 11 deletions(-)

diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp
index f826e8448c..76cc3ecad1 100644
--- a/paddle/parameter/ParameterUpdaterHook.cpp
+++ b/paddle/parameter/ParameterUpdaterHook.cpp
@@ -25,6 +25,9 @@ limitations under the License. */
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/Util.h"

+using std::vector;
+using std::pair;
+
 namespace paddle {

 /**
@@ -131,6 +134,73 @@ private:
   std::vector<bool> mask_;
 };

+class DynamicPruningHook : public IParameterUpdaterHook {
+public:
+  explicit DynamicPruningHook(const ParameterUpdaterHookConfig& hookConfig)
+      : initCount_(0) {
+    sparsityRatio_ = hookConfig.sparsity_ratio();
+  }
+
+  static bool sortPairAscend(const pair<real, size_t>& pair1,
+                             const pair<real, size_t>& pair2) {
+    return pair1.first > pair2.first;
+  }
+
+  void update(Parameter* para) {
+    updateThreadChecker_.check();
+    auto& vec = para->getBuf(PARAMETER_GRADIENT);
+    if (vec) {
+      vec->dotMul(*maskVec_);
+    }
+  }
+
+  void generateMask(Parameter* para) {
+    VectorPtr vec = para->getBuf(PARAMETER_VALUE);
+    maskTemp_ = Vector::create(para->getSize(), false);
+    maskTemp_->zeroMem();
+    real* dataPtr = maskTemp_->getData();
+
+    VectorPtr vecCpu = Vector::create(para->getSize(), false);
+    vecCpu->copyFrom(*vec);
+    vector<pair<real, size_t>> param;
+
+    for (size_t i = 0; i < para->getSize(); i++)
+      param.push_back(std::make_pair(fabs(vecCpu->getData()[i]), i));
+    std::sort(param.begin(), param.end(), sortPairAscend);
+
+    for (size_t i = 0; i < para->getSize() * sparsityRatio_; i++)
+      dataPtr[param[i].second] = 1.0;
+  }
+
+  void init(Parameter* para) {
+    generateMask(para);
+    size_t initCount = this->initCount_.fetch_add(1);
+    CHECK_EQ(initCount, 0UL) << "Currently the DynamicPruningHook must invoke "
+                                "in same ParameterUpdater";
+    VLOG(3) << "Initialize Parameter " << para;
+    SetDevice device(para->getDeviceId());
+
+    // Currently just use a mask vector for hack.
+    // @TODO(yuyang18): Implemented the mask operation in vector.
+    if (para->useGpu()) {
+      maskVec_ = Vector::create(para->getSize(), para->useGpu());
+      maskVec_->copyFrom(*maskTemp_);
+    } else {
+      maskVec_ = maskTemp_;
+    }
+
+    auto& vec = para->getBuf(PARAMETER_VALUE);
+    vec->dotMul(*maskVec_);
+  }
+
+private:
+  SameThreadChecker updateThreadChecker_;
+  std::atomic<size_t> initCount_;
+  VectorPtr maskVec_;
+  VectorPtr maskTemp_;
+  real sparsityRatio_;
+};
+
 IParameterUpdaterHook::IParameterUpdaterHook() {}

 IParameterUpdaterHook::~IParameterUpdaterHook() {}

@@ -156,8 +226,7 @@ private:

 static WeakKVCache<std::pair<std::string, int>,
                    IParameterUpdaterHook,
-                   StringIntPairHasher>
-    g_hookCache_;
+                   StringIntPairHasher> g_hookCache_;

 /**
  * ParameterUpdaterHook actually factory method.
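The mask construction in generateMask above is plain magnitude pruning: sort the weights by absolute value (sortPairAscend actually orders descending, despite its name) and keep only the largest ones. A minimal NumPy sketch of the same logic, illustrative only and not part of the patch; note that this first version reads sparsity_ratio as the fraction of weights *kept*, a meaning later patches in this series invert:

    import numpy as np

    def generate_mask(param, sparsity_ratio):
        # keep the sparsity_ratio * N largest-|w| entries, zero the rest,
        # mirroring DynamicPruningHook::generateMask above
        num_kept = int(param.size * sparsity_ratio)
        mask = np.zeros_like(param)
        top = np.argsort(-np.abs(param))[:num_kept]  # indices of largest |w|
        mask[top] = 1.0
        return mask

    w = np.array([0.05, -1.2, 0.3, -0.01, 2.0])
    print(generate_mask(w, sparsity_ratio=0.4))  # -> [0. 1. 0. 0. 1.]

The hook then dotMul's this 0/1 mask into the parameter value once at init() and into the gradient on every update().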
@@ -165,11 +234,22 @@ static WeakKVCache, static IParameterUpdaterHook* createImpl( const ParameterUpdaterHookConfig& config) { auto& type = config.type(); - if (type == "pruning") { - if (config.has_purning_mask_filename()) { + if (type == "pruning_static") { + if (config.has_purning_mask_filename()) return new StaticPruningHook(config.purning_mask_filename()); - } + else + LOG(FATAL) << "There must be mask_filename parameter for " << type + << " Hook"; + + } else if (type == "pruning") { + if (config.has_sparsity_ratio()) + return new DynamicPruningHook(config); + else + LOG(FATAL) << "There must be sparsity_ratio parameter for " << type + << " Hook"; } + + LOG(FATAL) << "Unknown Hook type: " << type; return nullptr; } diff --git a/proto/ParameterConfig.proto b/proto/ParameterConfig.proto index cbcd0af598..61f4b037cf 100644 --- a/proto/ParameterConfig.proto +++ b/proto/ParameterConfig.proto @@ -26,7 +26,9 @@ enum ParameterInitStrategy { message ParameterUpdaterHookConfig { required string type = 1; + //hook type such as 'pruning', 'pruning_static' optional string purning_mask_filename = 2; + optional double sparsity_ratio = 3; } message ParameterConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 9fe8794691..d80590210f 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -3171,12 +3171,19 @@ def Layer(name, type, **xargs): @config_func def ParameterHook(type, **kwargs): - if type == 'pruning': + if type == 'pruning_static': + hook = ParameterUpdaterHookConfig() + hook.type = type mask_filename = kwargs.get('mask_filename', None) assert mask_filename is not None + hook.pruning_mask_filename = mask_filename + return hook + elif type == 'pruning': hook = ParameterUpdaterHookConfig() hook.type = type - hook.purning_mask_filename = mask_filename + sparsity_ratio = kwargs.get('sparsity_ratio', None) + assert sparsity_ratio is not None + hook.sparsity_ratio = sparsity_ratio return hook else: return None @@ -3283,13 +3290,13 @@ def Parameter(name, if update_hooks is not None: if hasattr(update_hooks, '__call__'): - update_hooks = update_hooks(para.name) + update_hooks = update_hooks() if isinstance(update_hooks, list): for hook in update_hooks: para.update_hooks.extend([hook]) else: - para.update_hooks.extend(update_hooks) + para.update_hooks.extend([update_hooks]) g_parameter_map[name] = para diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index d1167a234c..011147a368 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -14,7 +14,8 @@ from paddle.trainer.config_parser import * __all__ = [ - 'ParamAttr', 'ExtraAttr', 'ParameterAttribute', 'ExtraLayerAttribute' + 'HookAttr', 'ParamAttr', 'ExtraAttr', 'ParameterAttribute', + 'ExtraLayerAttribute' ] @@ -55,6 +56,42 @@ def is_compatible_with(x, Type): return False +class HookAttribute(object): + """ + Hook Attribute object. The hook is an auxiliary operation that occurs + during network propagation. Such as pruning operation, It will cut off + redundant parameters in the network before training. More detail can see + here paddle/parameter/ParameterUpdaterHook.cpp + NOTE: IT IS A HIGH LEVEL USER INTERFACE. 
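+
+    A sketch of typical usage (relying on the ``update_hooks`` argument
+    this same patch adds to ``ParameterAttribute``):
+
+    .. code-block:: python
+
+        param_attr = ParamAttr(update_hooks=HookAttribute(
+            'pruning', sparsity_ratio=0.75))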
+ + :param type: Hook type, eg: 'pruning', 'pruning_static' + :type type: string + + :param mask_file: Must be specified if hook type is 'pruning_static', + the network reads the mask from the file to determine which parameters should be cut off + :type mask_file: string + + :param sparsity_ratio: Must be specified if hook type is 'pruning', + the network will hold the sparsity_ratio maximum parameters, and cut off the rest. + :type sparsity_ratio: float number between 0 and 1 + + """ + + def __init__(self, type, mask_filename=None, sparsity_ratio=None): + self.type = type + self.mask_filename = mask_filename + self.sparsity_ratio = sparsity_ratio + assert is_compatible_with(self.sparsity_ratio, + float), 'sparisity_ratio must be float type' + assert self.sparsity_ratio <= 1 and self.sparsity_ratio >= 0, 'sparisity must be a flaot between [0, 1] ' + + def __call__(self): + return ParameterHook( + self.type, + mask_filename=self.mask_filename, + sparsity_ratio=self.sparsity_ratio) + + class ParameterAttribute(object): """ Parameter Attributes object. To fine-tuning network training process, user @@ -109,7 +146,8 @@ class ParameterAttribute(object): learning_rate=None, momentum=None, gradient_clipping_threshold=None, - sparse_update=False): + sparse_update=False, + update_hooks=None): self.attr = {} if is_static: @@ -162,6 +200,9 @@ class ParameterAttribute(object): self.attr['gradient_clipping_threshold'] = \ gradient_clipping_threshold + if update_hooks: + self.attr['update_hooks'] = update_hooks + def set_default_parameter_name(self, name): """ Set default parameter name. If parameter not set, then will use default @@ -237,5 +278,6 @@ class ExtraLayerAttribute(object): return attr.attr +HookAttr = HookAttribute ParamAttr = ParameterAttribute ExtraAttr = ExtraLayerAttribute diff --git a/python/paddle/v2/attr.py b/python/paddle/v2/attr.py index 32f78614e7..5d23894d73 100644 --- a/python/paddle/v2/attr.py +++ b/python/paddle/v2/attr.py @@ -17,10 +17,12 @@ import paddle.trainer_config_helpers.attrs __all__ = [ "Param", "Extra", + "Hook", ] Param = paddle.trainer_config_helpers.attrs.ParameterAttribute Extra = paddle.trainer_config_helpers.attrs.ExtraLayerAttribute +Hook = paddle.trainer_config_helpers.attrs.HookAttribute for each in paddle.trainer_config_helpers.attrs.__all__: globals()[each] = getattr(paddle.trainer_config_helpers.attrs, each) From 18435f2a738b2baec680eea6fc2648dd094e5c87 Mon Sep 17 00:00:00 2001 From: xzl Date: Fri, 2 Jun 2017 16:31:49 +0800 Subject: [PATCH 02/69] modify the pruning from reading mask to specify sparsity_ratio --- paddle/parameter/ParameterUpdaterHook.cpp | 130 ++---------------- proto/ParameterConfig.proto | 3 +- python/paddle/trainer/config_parser.py | 9 +- python/paddle/trainer_config_helpers/attrs.py | 14 +- 4 files changed, 17 insertions(+), 139 deletions(-) diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp index 76cc3ecad1..e29494868b 100644 --- a/paddle/parameter/ParameterUpdaterHook.cpp +++ b/paddle/parameter/ParameterUpdaterHook.cpp @@ -19,130 +19,31 @@ limitations under the License. */ #include #include #include +#include #include "paddle/math/Vector.h" #include "paddle/parameter/Parameter.h" #include "paddle/utils/Flags.h" #include "paddle/utils/Util.h" -using std::vector; -using std::pair; - namespace paddle { /** * The static pruning hook - * - * Static means user load a mask map before training started. This map will - * define which link/weight between neural is disabled. 
+ * Static means user specific a sparsity_ratio map before training started. The + * network will + * hold the sparsity_ratio maximum numbers of parameters, and cut off the rest. */ -class StaticPruningHook : public IParameterUpdaterHook { -public: - /** - * The Mask Map Header. - * The map file started with this header. - * - * In Version 0, reset file will be: - * contains header.size bit, each bit means such weight is enabled or not. - * if bit is 1, then such weight is enabled. - * at end, the file will round to byte, and the low bits of end byte will be - * filled by zero. - * - */ - struct StaticMaskHeader { - uint32_t version; - size_t size; - } __attribute__((__packed__)); - - explicit StaticPruningHook(const std::string& mask_filename) : initCount_(0) { - bool ok = this->loadMaskFile(mask_filename); - if (!ok) { - LOG(WARNING) << "Fail to load mask file " << mask_filename - << " in current directory, searching in init_model_path"; - std::string combineMaskFilename = - path::join(FLAGS_init_model_path, mask_filename); - CHECK(this->loadMaskFile(combineMaskFilename)) - << "Cannot load " << mask_filename << " in ./" << mask_filename - << " and " << combineMaskFilename; - } - VLOG(3) << mask_filename << " mask size = " << this->mask_.size(); - } - void update(Parameter* para) { - updateThreadChecker_.check(); - auto& vec = para->getBuf(PARAMETER_GRADIENT); - if (vec) { - vec->dotMul(*maskVec_); - } - } - - void init(Parameter* para) { - size_t initCount = this->initCount_.fetch_add(1); - CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must invoke " - "in same ParamterUpdater"; - VLOG(3) << "Initialize Parameter " << para; - SetDevice device(para->getDeviceId()); - - auto maskVec = Vector::create(this->mask_.size(), false); - { // Initialize maskVec with float mask vector - real* dataPtr = maskVec->getData(); - size_t i = 0; - for (bool m : mask_) { - dataPtr[i++] = m ? 1.0 : 0.0; - } - } - - // Currently just use a mask vector for hack. - // @TODO(yuyang18): Implemented the mask operation in vector. 
- if (para->useGpu()) { - maskVec_ = Vector::create(this->mask_.size(), para->useGpu()); - maskVec_->copyFrom(*maskVec); - } else { - maskVec_ = maskVec; - } - - auto& vec = para->getBuf(PARAMETER_VALUE); - vec->dotMul(*maskVec_); - } - -private: - bool loadMaskFile(const std::string& mask_filename) { - std::ifstream fin; - fin.open(mask_filename); - if (fin.is_open()) { - StaticMaskHeader header; - fin.read(reinterpret_cast(&header), sizeof(StaticMaskHeader)); - CHECK_EQ(header.version, 0UL); - mask_.resize(header.size); - uint8_t buf; - for (size_t i = 0; i < header.size; ++i, buf <<= 1) { - if (i % 8 == 0) { - fin.read(reinterpret_cast(&buf), sizeof(uint8_t)); - } - mask_[i] = buf & 0x80; - } - fin.close(); - return true; - } else { - return false; - } - } - - SameThreadChecker updateThreadChecker_; - std::atomic initCount_; - VectorPtr maskVec_; - std::vector mask_; -}; - -class DynamicPruningHook : public IParameterUpdaterHook { +class StaticPruningHook : public IParameterUpdaterHook { public: - explicit DynamicPruningHook(const ParameterUpdaterHookConfig& hookConfig) + explicit StaticPruningHook(const ParameterUpdaterHookConfig& hookConfig) : initCount_(0) { sparsityRatio_ = hookConfig.sparsity_ratio(); } - static bool sortPairAscend(const pair& pair1, - const pair& pair2) { + static bool sortPairAscend(const std::pair& pair1, + const std::pair& pair2) { return pair1.first > pair2.first; } @@ -162,7 +63,7 @@ public: VectorPtr vecCpu = Vector::create(para->getSize(), false); vecCpu->copyFrom(*vec); - vector> param; + std::vector> param; for (size_t i = 0; i < para->getSize(); i++) param.push_back(std::make_pair(fabs(vecCpu->getData()[i]), i)); @@ -175,7 +76,7 @@ public: void init(Parameter* para) { generateMask(para); size_t initCount = this->initCount_.fetch_add(1); - CHECK_EQ(initCount, 0UL) << "Currently the DynamicPruningHook must invoke " + CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must invoke " "in same ParamterUpdater"; VLOG(3) << "Initialize Parameter " << para; SetDevice device(para->getDeviceId()); @@ -234,16 +135,9 @@ static WeakKVCache, static IParameterUpdaterHook* createImpl( const ParameterUpdaterHookConfig& config) { auto& type = config.type(); - if (type == "pruning_static") { - if (config.has_purning_mask_filename()) - return new StaticPruningHook(config.purning_mask_filename()); - else - LOG(FATAL) << "There must be mask_filename parameter for " << type - << " Hook"; - - } else if (type == "pruning") { + if (type == "pruning") { if (config.has_sparsity_ratio()) - return new DynamicPruningHook(config); + return new StaticPruningHook(config); else LOG(FATAL) << "There must be sparsity_ratio parameter for " << type << " Hook"; diff --git a/proto/ParameterConfig.proto b/proto/ParameterConfig.proto index 61f4b037cf..53e3b94f03 100644 --- a/proto/ParameterConfig.proto +++ b/proto/ParameterConfig.proto @@ -26,8 +26,7 @@ enum ParameterInitStrategy { message ParameterUpdaterHookConfig { required string type = 1; - //hook type such as 'pruning', 'pruning_static' - optional string purning_mask_filename = 2; + //hook type such as 'pruning' optional double sparsity_ratio = 3; } diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 3775375c9b..bebb76d984 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -3171,14 +3171,7 @@ def Layer(name, type, **xargs): @config_func def ParameterHook(type, **kwargs): - if type == 'pruning_static': - hook = 
ParameterUpdaterHookConfig() - hook.type = type - mask_filename = kwargs.get('mask_filename', None) - assert mask_filename is not None - hook.pruning_mask_filename = mask_filename - return hook - elif type == 'pruning': + if type == 'pruning': hook = ParameterUpdaterHookConfig() hook.type = type sparsity_ratio = kwargs.get('sparsity_ratio', None) diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index 011147a368..a0ad8c4452 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -64,32 +64,24 @@ class HookAttribute(object): here paddle/parameter/ParameterUpdaterHook.cpp NOTE: IT IS A HIGH LEVEL USER INTERFACE. - :param type: Hook type, eg: 'pruning', 'pruning_static' + :param type: Hook type, eg: 'pruning' :type type: string - :param mask_file: Must be specified if hook type is 'pruning_static', - the network reads the mask from the file to determine which parameters should be cut off - :type mask_file: string - :param sparsity_ratio: Must be specified if hook type is 'pruning', the network will hold the sparsity_ratio maximum parameters, and cut off the rest. :type sparsity_ratio: float number between 0 and 1 """ - def __init__(self, type, mask_filename=None, sparsity_ratio=None): + def __init__(self, type, sparsity_ratio=None): self.type = type - self.mask_filename = mask_filename self.sparsity_ratio = sparsity_ratio assert is_compatible_with(self.sparsity_ratio, float), 'sparisity_ratio must be float type' assert self.sparsity_ratio <= 1 and self.sparsity_ratio >= 0, 'sparisity must be a flaot between [0, 1] ' def __call__(self): - return ParameterHook( - self.type, - mask_filename=self.mask_filename, - sparsity_ratio=self.sparsity_ratio) + return ParameterHook(self.type, sparsity_ratio=self.sparsity_ratio) class ParameterAttribute(object): From 092828fbe30e40b72fc25d8ab9c56ac7ecb5afe4 Mon Sep 17 00:00:00 2001 From: xzl Date: Mon, 5 Jun 2017 17:42:33 +0800 Subject: [PATCH 03/69] modify the doc of the interface --- paddle/parameter/ParameterUpdaterHook.cpp | 6 +++--- proto/ParameterConfig.proto | 4 ++-- python/paddle/trainer_config_helpers/attrs.py | 11 ++++------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp index e29494868b..5e8c77ced0 100644 --- a/paddle/parameter/ParameterUpdaterHook.cpp +++ b/paddle/parameter/ParameterUpdaterHook.cpp @@ -30,9 +30,9 @@ namespace paddle { /** * The static pruning hook - * Static means user specific a sparsity_ratio map before training started. The - * network will - * hold the sparsity_ratio maximum numbers of parameters, and cut off the rest. + * Static means user specific a sparsity_ratio before training start, and the + * network will prune the parameters based on the sparsity_ratio. More deatils + * can see https://arxiv.org/pdf/1506.02626.pdf. 
*/ class StaticPruningHook : public IParameterUpdaterHook { diff --git a/proto/ParameterConfig.proto b/proto/ParameterConfig.proto index 53e3b94f03..360342bac6 100644 --- a/proto/ParameterConfig.proto +++ b/proto/ParameterConfig.proto @@ -25,9 +25,9 @@ enum ParameterInitStrategy { } message ParameterUpdaterHookConfig { + // hook type such as 'pruning' required string type = 1; - //hook type such as 'pruning' - optional double sparsity_ratio = 3; + optional double sparsity_ratio = 2 [default = 0.8]; } message ParameterConfig { diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index a0ad8c4452..556701ca7a 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -59,17 +59,14 @@ def is_compatible_with(x, Type): class HookAttribute(object): """ Hook Attribute object. The hook is an auxiliary operation that occurs - during network propagation. Such as pruning operation, It will cut off - redundant parameters in the network before training. More detail can see - here paddle/parameter/ParameterUpdaterHook.cpp + during network propagation. NOTE: IT IS A HIGH LEVEL USER INTERFACE. - + :param type: Hook type, eg: 'pruning' :type type: string - :param sparsity_ratio: Must be specified if hook type is 'pruning', - the network will hold the sparsity_ratio maximum parameters, and cut off the rest. - :type sparsity_ratio: float number between 0 and 1 + :param sparsity_ratio: Must be specified if hook type is 'pruning' + :type sparsity_ratio: float or None """ From 597a58c3efe015be43e1e20a20a04921a9ae7c60 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 13 Jun 2017 23:52:16 +0800 Subject: [PATCH 04/69] Add DetectionMAPEvaluator. --- .../evaluators/DetectionMAPEvaluator.cpp | 312 ++++++++++++++++++ paddle/gserver/tests/test_Evaluator.cpp | 17 + proto/ModelConfig.proto | 9 + python/paddle/trainer/config_parser.py | 43 ++- .../trainer_config_helpers/evaluators.py | 105 ++++-- 5 files changed, 453 insertions(+), 33 deletions(-) create mode 100644 paddle/gserver/evaluators/DetectionMAPEvaluator.cpp diff --git a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp new file mode 100644 index 0000000000..7d326c2db1 --- /dev/null +++ b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp @@ -0,0 +1,312 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Evaluator.h" +#include "paddle/gserver/layers/DetectionUtil.h" + +using std::map; +using std::vector; +using std::pair; +using std::make_pair; + +namespace paddle { + +/** + * @brief detection map Evaluator + * + * The config file api is detection_map_evaluator. 
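 *
 * Layout note (inferred from the strides used in evalImp below, not
 * stated explicitly in this patch): each detection row is expected to
 * hold 7 values (image_id, label, score, xmin, ymin, xmax, ymax) and
 * each label row 6 values (label, xmin, ymin, xmax, ymax, difficult).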
+ */ +class DetectionMAPEvaluator : public Evaluator { +public: + DetectionMAPEvaluator() + : evaluateDifficult_(false), cpuOutput_(nullptr), cpuLabel_(nullptr) {} + + virtual void start() { + Evaluator::start(); + allTruePos_.clear(); + allFalsePos_.clear(); + numPos_.clear(); + } + + virtual real evalImp(std::vector& arguments) { + overlapThreshold_ = config_.overlap_threshold(); + backgroundId_ = config_.background_id(); + evaluateDifficult_ = config_.evaluate_difficult(); + apType_ = config_.ap_type(); + + MatrixPtr detectTmpValue = arguments[0].value; + Matrix::resizeOrCreate(cpuOutput_, + detectTmpValue->getHeight(), + detectTmpValue->getWidth(), + false, + false); + + MatrixPtr labelTmpValue = arguments[1].value; + Matrix::resizeOrCreate(cpuLabel_, + labelTmpValue->getHeight(), + labelTmpValue->getWidth(), + false, + false); + + cpuOutput_->copyFrom(*detectTmpValue); + cpuLabel_->copyFrom(*labelTmpValue); + + Argument label = arguments[1]; + const int* labelIndex = label.sequenceStartPositions->getData(false); + size_t batchSize = label.getNumSequences(); + + vector>> allGTBBoxes; + vector>>> allDetectBBoxes; + + for (size_t n = 0; n < batchSize; ++n) { + map> bboxes; + for (int i = labelIndex[n]; i < labelIndex[n + 1]; ++i) { + vector bbox; + getBBoxFromLabelData(cpuLabel_->getData() + i * 6, 1, bbox); + int c = cpuLabel_->getData()[i * 6]; + bboxes[c].push_back(bbox[0]); + } + allGTBBoxes.push_back(bboxes); + } + + size_t imgId = 0; + for (size_t n = 0; n < cpuOutput_->getHeight();) { + map>> bboxes; + while (cpuOutput_->getData()[n * 7] == imgId && + n < cpuOutput_->getHeight()) { + vector label; + vector score; + vector bbox; + getBBoxFromDetectData( + cpuOutput_->getData() + n * 7, 1, label, score, bbox); + bboxes[label[0]].push_back(make_pair(score[0], bbox[0])); + ++n; + } + ++imgId; + if (imgId > batchSize) break; + allDetectBBoxes.push_back(bboxes); + } + + for (size_t n = 0; n < batchSize; ++n) { + for (map>::iterator it = + allGTBBoxes[n].begin(); + it != allGTBBoxes[n].end(); + ++it) { + size_t count = 0; + if (evaluateDifficult_) { + count = it->second.size(); + } else { + for (size_t i = 0; i < it->second.size(); ++i) + if (!(it->second[i].isDifficult)) ++count; + } + if (numPos_.find(it->first) == numPos_.end() && count != 0) { + numPos_[it->first] = count; + } else { + numPos_[it->first] += count; + } + } + } + + // calcTFPos + calcTFPos( + batchSize, allGTBBoxes, allDetectBBoxes, &allTruePos_, &allFalsePos_); + + return 0; + } + + virtual void printStats(std::ostream& os) const { + real mAP = calcMAP(); + os << "Detection mAP=" << mAP * 100; + } + + virtual void distributeEval(ParameterClient2* client) { + LOG(FATAL) << "Distribute detection evaluation not implemented."; + } + +protected: + void calcTFPos(const size_t batchSize, + const vector>>& allGTBBoxes, + const vector>>>& + allDetectBBoxes, + map>>* allTruePos, + map>>* allFalsePos) { + for (size_t n = 0; n < allDetectBBoxes.size(); ++n) { + if (allGTBBoxes[n].size() == 0) { + for (map>>::const_iterator + it = allDetectBBoxes[n].begin(); + it != allDetectBBoxes[n].end(); + ++it) { + size_t label = it->first; + for (size_t i = 0; i < it->second.size(); ++i) { + (*allTruePos)[label].push_back(make_pair(it->second[i].first, 0)); + (*allFalsePos)[label].push_back(make_pair(it->second[i].first, 1)); + } + } + } else { + for (map>>::const_iterator + it = allDetectBBoxes[n].begin(); + it != allDetectBBoxes[n].end(); + ++it) { + size_t label = it->first; + vector> predBBoxes = it->second; + if 
(allGTBBoxes[n].find(label) == allGTBBoxes[n].end()) { + for (size_t i = 0; i < predBBoxes.size(); ++i) { + (*allTruePos)[label].push_back(make_pair(predBBoxes[i].first, 0)); + (*allFalsePos)[label].push_back( + make_pair(predBBoxes[i].first, 1)); + } + } else { + vector gtBBoxes = + allGTBBoxes[n].find(label)->second; + vector visited(gtBBoxes.size(), false); + // Sort detections in descend order based on scores + std::sort(predBBoxes.begin(), + predBBoxes.end(), + sortScorePairDescend); + for (size_t i = 0; i < predBBoxes.size(); ++i) { + real maxOverlap = -1.0; + size_t maxIdx = 0; + for (size_t j = 0; j < gtBBoxes.size(); ++j) { + real overlap = + jaccardOverlap(predBBoxes[i].second, gtBBoxes[j]); + if (overlap > maxOverlap) { + maxOverlap = overlap; + maxIdx = j; + } + } + if (maxOverlap > overlapThreshold_) { + if (evaluateDifficult_ || + (!evaluateDifficult_ && !gtBBoxes[maxIdx].isDifficult)) { + if (!visited[maxIdx]) { + (*allTruePos)[label].push_back( + make_pair(predBBoxes[i].first, 1)); + (*allFalsePos)[label].push_back( + make_pair(predBBoxes[i].first, 0)); + visited[maxIdx] = true; + } else { + (*allTruePos)[label].push_back( + make_pair(predBBoxes[i].first, 0)); + (*allFalsePos)[label].push_back( + make_pair(predBBoxes[i].first, 1)); + } + } + } else { + (*allTruePos)[label].push_back( + make_pair(predBBoxes[i].first, 0)); + (*allFalsePos)[label].push_back( + make_pair(predBBoxes[i].first, 1)); + } + } + } + } + } + } + } + + real calcMAP() const { + real mAP = 0.0; + size_t count = 0; + for (map::const_iterator it = numPos_.begin(); + it != numPos_.end(); + ++it) { + size_t label = it->first; + size_t labelNumPos = it->second; + if (labelNumPos == 0 || allTruePos_.find(label) == allTruePos_.end()) + continue; + vector> labelTruePos = allTruePos_.find(label)->second; + vector> labelFalsePos = + allFalsePos_.find(label)->second; + // Compute average precision. + vector tpCumSum; + getAccumulation(labelTruePos, &tpCumSum); + vector fpCumSum; + getAccumulation(labelFalsePos, &fpCumSum); + std::vector precision, recall; + size_t num = tpCumSum.size(); + // Compute Precision. + for (size_t i = 0; i < num; ++i) { + CHECK_LE(tpCumSum[i], labelNumPos); + precision.push_back(static_cast(tpCumSum[i]) / + static_cast(tpCumSum[i] + fpCumSum[i])); + recall.push_back(static_cast(tpCumSum[i]) / labelNumPos); + } + // VOC2007 style + if (apType_ == "11point") { + vector maxPrecisions(11, 0.0); + int startIdx = num - 1; + for (int j = 10; j >= 0; --j) + for (int i = startIdx; i >= 0; --i) { + if (recall[i] < j / 10.) 
{ + startIdx = i; + if (j > 0) maxPrecisions[j - 1] = maxPrecisions[j]; + break; + } else { + if (maxPrecisions[j] < precision[i]) + maxPrecisions[j] = precision[i]; + } + } + for (int j = 10; j >= 0; --j) mAP += maxPrecisions[j] / 11; + ++count; + } else if (apType_ == "Integral") { + // Nature integral + real averagePrecisions = 0.; + real prevRecall = 0.; + for (size_t i = 0; i < num; ++i) { + if (fabs(recall[i] - prevRecall) > 1e-6) + averagePrecisions += precision[i] * fabs(recall[i] - prevRecall); + prevRecall = recall[i]; + } + mAP += averagePrecisions; + ++count; + } else { + LOG(FATAL) << "Unkown ap version: " << apType_; + } + } + if (count != 0) mAP /= count; + return mAP; + } + + void getAccumulation(vector> inPairs, + vector* accuVec) const { + std::stable_sort( + inPairs.begin(), inPairs.end(), sortScorePairDescend); + accuVec->clear(); + size_t sum = 0; + for (size_t i = 0; i < inPairs.size(); ++i) { + sum += inPairs[i].second; + accuVec->push_back(sum); + } + } + + std::string getTypeImpl() const { return "detection_map"; } + + real getValueImpl() const { return calcMAP() * 100; } + +private: + real overlapThreshold_; + bool evaluateDifficult_; + size_t backgroundId_; + std::string apType_; + + MatrixPtr cpuOutput_; + MatrixPtr cpuLabel_; + + map numPos_; + map>> allTruePos_; + map>> allFalsePos_; +}; + +REGISTER_EVALUATOR(detection_map, DetectionMAPEvaluator); + +} // namespace paddle diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp index 4f5fdbb37c..93996392d2 100644 --- a/paddle/gserver/tests/test_Evaluator.cpp +++ b/paddle/gserver/tests/test_Evaluator.cpp @@ -138,6 +138,23 @@ void testEvaluatorAll(TestConfig testConf, testEvaluator(testConf, testEvaluatorName, batchSize, false); } +TEST(Evaluator, detection_map) { + TestConfig config; + config.evaluatorConfig.set_type("detection_map"); + config.evaluatorConfig.set_overlap_threshold(0.5); + config.evaluatorConfig.set_background_id(0); + config.evaluatorConfig.set_ap_type("Integral"); + config.evaluatorConfig.set_evaluate_difficult(0); + + config.inputDefs.push_back({INPUT_DATA, "output", 7}); + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "label", 6}); + config.evaluatorConfig.set_evaluate_difficult(false); + testEvaluatorAll(config, "detection_map", 100); + + config.evaluatorConfig.set_evaluate_difficult(true); + testEvaluatorAll(config, "detection_map", 100); +} + TEST(Evaluator, classification_error) { TestConfig config; config.evaluatorConfig.set_type("classification_error"); diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 29270829bb..ebe4f5cbb5 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -489,6 +489,15 @@ message EvaluatorConfig { // Used by ClassificationErrorEvaluator // top # classification error optional int32 top_k = 13 [default = 1]; + + // Used by DetectionMAPEvaluator + optional double overlap_threshold = 14 [default = 0.5]; + + optional int32 background_id = 15 [default = 0]; + + optional bool evaluate_difficult = 16 [default = false]; + + optional string ap_type = 17 [default = "11point"]; } message LinkConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 0792e2d40b..e78dc4f3b4 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1300,20 +1300,23 @@ def parse_maxout(maxout, input_layer_name, maxout_conf): # Define an evaluator @config_func -def Evaluator( - name, - type, - inputs, - chunk_scheme=None, - 
num_chunk_types=None, - classification_threshold=None, - positive_label=None, - dict_file=None, - result_file=None, - num_results=None, - top_k=None, - delimited=None, - excluded_chunk_types=None, ): +def Evaluator(name, + type, + inputs, + chunk_scheme=None, + num_chunk_types=None, + classification_threshold=None, + positive_label=None, + dict_file=None, + result_file=None, + num_results=None, + top_k=None, + delimited=None, + excluded_chunk_types=None, + overlap_threshold=None, + background_id=None, + evaluate_difficult=None, + ap_type=None): evaluator = g_config.model_config.evaluators.add() evaluator.type = type evaluator.name = MakeLayerNameInSubmodel(name) @@ -1347,6 +1350,18 @@ def Evaluator( if excluded_chunk_types: evaluator.excluded_chunk_types.extend(excluded_chunk_types) + if overlap_threshold is not None: + evaluator.overlap_threshold = overlap_threshold + + if background_id is not None: + evaluator.background_id = background_id + + if evaluate_difficult is not None: + evaluator.evaluate_difficult = evaluate_difficult + + if ap_type is not None: + evaluator.ap_type = ap_type + class LayerBase(object): def __init__( diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py index a5234f3e47..1dcd804803 100644 --- a/python/paddle/trainer_config_helpers/evaluators.py +++ b/python/paddle/trainer_config_helpers/evaluators.py @@ -21,7 +21,8 @@ __all__ = [ "chunk_evaluator", "sum_evaluator", "column_sum_evaluator", "value_printer_evaluator", "gradient_printer_evaluator", "maxid_printer_evaluator", "maxframe_printer_evaluator", - "seqtext_printer_evaluator", "classification_error_printer_evaluator" + "seqtext_printer_evaluator", "classification_error_printer_evaluator", + "detection_map_evaluator" ] @@ -31,10 +32,11 @@ class EvaluatorAttribute(object): FOR_RANK = 1 << 2 FOR_PRINT = 1 << 3 FOR_UTILS = 1 << 4 + FOR_DETECTION = 1 << 5 KEYS = [ "for_classification", "for_regression", "for_rank", "for_print", - "for_utils" + "for_utils", "for_detection" ] @staticmethod @@ -57,22 +59,25 @@ def evaluator(*attrs): return impl -def evaluator_base( - input, - type, - label=None, - weight=None, - name=None, - chunk_scheme=None, - num_chunk_types=None, - classification_threshold=None, - positive_label=None, - dict_file=None, - result_file=None, - num_results=None, - delimited=None, - top_k=None, - excluded_chunk_types=None, ): +def evaluator_base(input, + type, + label=None, + weight=None, + name=None, + chunk_scheme=None, + num_chunk_types=None, + classification_threshold=None, + positive_label=None, + dict_file=None, + result_file=None, + num_results=None, + delimited=None, + top_k=None, + excluded_chunk_types=None, + overlap_threshold=None, + background_id=None, + evaluate_difficult=None, + ap_type=None): """ Evaluator will evaluate the network status while training/testing. @@ -107,6 +112,14 @@ def evaluator_base( :type weight: LayerOutput. :param top_k: number k in top-k error rate :type top_k: int + :param overlap_threshold: In detection tasks to filter detection results + :type overlap_threshold: float + :param background_id: Identifier of background class + :type background_id: int + :param evaluate_difficult: Whether to evaluate difficult objects + :type evaluate_difficult: bool + :param ap_type: How to calculate average persicion + :type ap_type: str """ # inputs type assertions. 
assert classification_threshold is None or isinstance( @@ -136,7 +149,61 @@ def evaluator_base( delimited=delimited, num_results=num_results, top_k=top_k, - excluded_chunk_types=excluded_chunk_types, ) + excluded_chunk_types=excluded_chunk_types, + overlap_threshold=overlap_threshold, + background_id=background_id, + evaluate_difficult=evaluate_difficult, + ap_type=ap_type) + + +@evaluator(EvaluatorAttribute.FOR_DETECTION) +@wrap_name_default() +def detection_map_evaluator(input, + label, + overlap_threshold=0.5, + background_id=0, + evaluate_difficult=False, + ap_type="11point", + name=None): + """ + Detection mAP Evaluator. It will print mean Average Precision for detection. + + The detection mAP Evaluator according to the detection_output's output count + the true positive and the false positive bbox and integral them to get the + mAP. + + The simple usage is: + + .. code-block:: python + + eval = detection_map_evaluator(input=det_output,label=lbl) + + :param input: Input layer. + :type input: LayerOutput + :param label: Label layer. + :type label: LayerOutput + :param overlap_threshold: The bbox overlap threshold of a true positive. + :type overlap_threshold: float + :param background_id: The background class index. + :type background_id: int + :param evaluate_difficult: Wether evaluate a difficult ground truth. + :type evaluate_difficult: bool + """ + if not isinstance(input, list): + input = [input] + + if label: + input.append(label) + + evaluator_base( + name=name, + type="detection_map", + input=input, + label=label, + overlap_threshold=overlap_threshold, + background_id=background_id, + evaluate_difficult=evaluate_difficult, + ap_type=ap_type) @evaluator(EvaluatorAttribute.FOR_CLASSIFICATION) From 997cef2e63ef4d7c99c58710289f7581d2af08c6 Mon Sep 17 00:00:00 2001 From: xzl Date: Wed, 14 Jun 2017 17:26:08 +0800 Subject: [PATCH 05/69] tiny modify --- paddle/parameter/ParameterUpdaterHook.cpp | 33 +++++++++---------- python/paddle/trainer/config_parser.py | 4 +-- python/paddle/trainer_config_helpers/attrs.py | 8 +++-- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp index 5e8c77ced0..a581cc047d 100644 --- a/paddle/parameter/ParameterUpdaterHook.cpp +++ b/paddle/parameter/ParameterUpdaterHook.cpp @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include +#include #include "paddle/math/Vector.h" #include "paddle/parameter/Parameter.h" @@ -60,6 +61,7 @@ public: maskTemp_ = Vector::create(para->getSize(), false); maskTemp_->zeroMem(); real* dataPtr = maskTemp_->getData(); + size_t sparsityNum = para->getSize() * (1 - sparsityRatio_); VectorPtr vecCpu = Vector::create(para->getSize(), false); vecCpu->copyFrom(*vec); @@ -67,10 +69,20 @@ public: for (size_t i = 0; i < para->getSize(); i++) param.push_back(std::make_pair(fabs(vecCpu->getData()[i]), i)); - std::sort(param.begin(), param.end(), sortPairAscend); - for (size_t i = 0; i < para->getSize() * sparsityRatio_; i++) - dataPtr[param[i].second] = 1.0; + std::partial_sort(param.begin(), + param.begin() + sparsityNum, + param.end(), + sortPairAscend); + for (size_t i = 0; i < sparsityNum; i++) dataPtr[param[i].second] = 1.0; + + // Currently just use a mask vector for hack. 
+ if (para->useGpu()) { + maskVec_ = Vector::create(para->getSize(), para->useGpu()); + maskVec_->copyFrom(*maskTemp_); + } else { + maskVec_ = maskTemp_; + } } void init(Parameter* para) { @@ -81,15 +93,6 @@ public: VLOG(3) << "Initialize Parameter " << para; SetDevice device(para->getDeviceId()); - // Currently just use a mask vector for hack. - // @TODO(yuyang18): Implemented the mask operation in vector. - if (para->useGpu()) { - maskVec_ = Vector::create(para->getSize(), para->useGpu()); - maskVec_->copyFrom(*maskTemp_); - } else { - maskVec_ = maskTemp_; - } - auto& vec = para->getBuf(PARAMETER_VALUE); vec->dotMul(*maskVec_); } @@ -136,11 +139,7 @@ static IParameterUpdaterHook* createImpl( const ParameterUpdaterHookConfig& config) { auto& type = config.type(); if (type == "pruning") { - if (config.has_sparsity_ratio()) - return new StaticPruningHook(config); - else - LOG(FATAL) << "There must be sparsity_ratio parameter for " << type - << " Hook"; + return new StaticPruningHook(config); } LOG(FATAL) << "Unknown Hook type: " << type; diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index e0147b1b37..3a29c91807 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -3175,8 +3175,8 @@ def ParameterHook(type, **kwargs): hook = ParameterUpdaterHookConfig() hook.type = type sparsity_ratio = kwargs.get('sparsity_ratio', None) - assert sparsity_ratio is not None - hook.sparsity_ratio = sparsity_ratio + if sparsity_ratio is not None: + hook.sparsity_ratio = sparsity_ratio return hook else: return None diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index 556701ca7a..27b54ffdea 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -73,9 +73,11 @@ class HookAttribute(object): def __init__(self, type, sparsity_ratio=None): self.type = type self.sparsity_ratio = sparsity_ratio - assert is_compatible_with(self.sparsity_ratio, - float), 'sparisity_ratio must be float type' - assert self.sparsity_ratio <= 1 and self.sparsity_ratio >= 0, 'sparisity must be a flaot between [0, 1] ' + if self.sparsity_ratio is not None: + assert is_compatible_with( + self.sparsity_ratio, + float), 'sparisity_ratio must be float type' + assert self.sparsity_ratio <= 1 and self.sparsity_ratio >= 0, 'sparisity must be a flaot between [0, 1] ' def __call__(self): return ParameterHook(self.type, sparsity_ratio=self.sparsity_ratio) From 98e4bb79ea3f569b35c69272e0ffebf6613c985a Mon Sep 17 00:00:00 2001 From: Zhaolong Xing Date: Wed, 14 Jun 2017 18:48:32 +0800 Subject: [PATCH 06/69] Create ParameterUpdaterHook.cpp --- paddle/parameter/ParameterUpdaterHook.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp index a581cc047d..a4c0cb3099 100644 --- a/paddle/parameter/ParameterUpdaterHook.cpp +++ b/paddle/parameter/ParameterUpdaterHook.cpp @@ -130,7 +130,8 @@ private: static WeakKVCache, IParameterUpdaterHook, - StringIntPairHasher> g_hookCache_; + StringIntPairHasher> + g_hookCache_; /** * ParameterUpdaterHook actually factory method. 
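PATCH 05 flips the meaning of sparsity_ratio: it is now the fraction of weights that end up zero, so (1 - sparsity_ratio) * N weights survive (PATCH 09 below makes this explicit in the proto comment). Updating the earlier NumPy sketch to the corrected semantics, with np.argpartition standing in for std::partial_sort since only the surviving entries need ordering (illustrative only; assumes 0 < sparsity_ratio < 1):

    import numpy as np

    def pruning_mask(param, sparsity_ratio):
        # sparsity_ratio = fraction of weights set to zero; the
        # non_zero_num largest-|w| entries keep a 1.0 in the mask
        non_zero_num = int(param.size * (1.0 - sparsity_ratio))
        mask = np.zeros_like(param)
        keep = np.argpartition(-np.abs(param), non_zero_num - 1)[:non_zero_num]
        mask[keep] = 1.0
        return mask

    w = np.array([0.05, -1.2, 0.3, -0.01, 2.0])
    print(pruning_mask(w, sparsity_ratio=0.6))  # -> [0. 1. 0. 0. 1.]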
From 4fbec8233b08dec0608b342563c62ecee946e460 Mon Sep 17 00:00:00 2001
From: Zhaolong Xing
Date: Wed, 14 Jun 2017 18:49:00 +0800
Subject: [PATCH 07/69] Update ParameterUpdaterHook.cpp

---
 paddle/parameter/ParameterUpdaterHook.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp
index a4c0cb3099..3e3dcd6575 100644
--- a/paddle/parameter/ParameterUpdaterHook.cpp
+++ b/paddle/parameter/ParameterUpdaterHook.cpp
@@ -130,7 +130,7 @@ private:
 static WeakKVCache<std::pair<std::string, int>,
                    IParameterUpdaterHook,
-                   StringIntPairHasher>
+                   StringIntPairHasher>
     g_hookCache_;

From 5405dc0a65e3bb4a9b807a46bb1296cddce44a7e Mon Sep 17 00:00:00 2001
From: Zhaolong Xing
Date: Wed, 14 Jun 2017 19:15:51 +0800
Subject: [PATCH 08/69] Create ParameterUpdaterHook.cpp

---
 paddle/parameter/ParameterUpdaterHook.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp
index 3e3dcd6575..44fac59200 100644
--- a/paddle/parameter/ParameterUpdaterHook.cpp
+++ b/paddle/parameter/ParameterUpdaterHook.cpp
@@ -14,13 +14,13 @@ limitations under the License. */

 #include "ParameterUpdaterHook.h"

+#include <algorithm>
 #include <atomic>
 #include <fstream>
 #include <mutex>
 #include <thread>
 #include <unordered_map>
 #include <vector>
-#include <algorithm>

 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/Util.h"

From fc9e3e4bda6a4ceaa1ae9e45eb3ef522382bf8e3 Mon Sep 17 00:00:00 2001
From: zlx
Date: Fri, 16 Jun 2017 14:29:16 +0800
Subject: [PATCH 09/69] explain the sparsity ratio

---
 paddle/parameter/ParameterUpdaterHook.cpp     | 6 +++---
 proto/ParameterConfig.proto                   | 3 ++-
 python/paddle/trainer_config_helpers/attrs.py | 3 ++-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp
index 44fac59200..1cc91b727a 100644
--- a/paddle/parameter/ParameterUpdaterHook.cpp
+++ b/paddle/parameter/ParameterUpdaterHook.cpp
@@ -61,7 +61,7 @@ public:
     maskTemp_ = Vector::create(para->getSize(), false);
     maskTemp_->zeroMem();
     real* dataPtr = maskTemp_->getData();
-    size_t sparsityNum = para->getSize() * (1 - sparsityRatio_);
+    size_t nonZeroNum = para->getSize() * (1 - sparsityRatio_);

     VectorPtr vecCpu = Vector::create(para->getSize(), false);
     vecCpu->copyFrom(*vec);
@@ -71,10 +71,10 @@ public:
       param.push_back(std::make_pair(fabs(vecCpu->getData()[i]), i));

     std::partial_sort(param.begin(),
-                      param.begin() + sparsityNum,
+                      param.begin() + nonZeroNum,
                       param.end(),
                       sortPairAscend);
-    for (size_t i = 0; i < sparsityNum; i++) dataPtr[param[i].second] = 1.0;
+    for (size_t i = 0; i < nonZeroNum; i++) dataPtr[param[i].second] = 1.0;

     // Currently just use a mask vector for hack.
if (para->useGpu()) { diff --git a/proto/ParameterConfig.proto b/proto/ParameterConfig.proto index 360342bac6..580d663246 100644 --- a/proto/ParameterConfig.proto +++ b/proto/ParameterConfig.proto @@ -27,7 +27,8 @@ enum ParameterInitStrategy { message ParameterUpdaterHookConfig { // hook type such as 'pruning' required string type = 1; - optional double sparsity_ratio = 2 [default = 0.8]; + // this represents the ratio of zero element to be set by the Parameter + optional double sparsity_ratio = 2 [default = 0.6]; } message ParameterConfig { diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index 2e4e082efb..bf12ad644d 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -65,7 +65,8 @@ class HookAttribute(object): :param type: Hook type, eg: 'pruning' :type type: string - :param sparsity_ratio: Must be specified if hook type is 'pruning' + :param sparsity_ratio: Must be specified if hook type is 'pruning', + it represents the ratio of the zero elements to be set by the Parameter. :type sparsity_ratio: float or None """ From 885275ee77ddafa28cda0135fa752ca9d8afe1c8 Mon Sep 17 00:00:00 2001 From: Zhaolong Xing Date: Fri, 16 Jun 2017 14:59:18 +0800 Subject: [PATCH 10/69] Update ParameterUpdaterHook.cpp --- paddle/parameter/ParameterUpdaterHook.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp index 1cc91b727a..738e86a622 100644 --- a/paddle/parameter/ParameterUpdaterHook.cpp +++ b/paddle/parameter/ParameterUpdaterHook.cpp @@ -70,10 +70,8 @@ public: for (size_t i = 0; i < para->getSize(); i++) param.push_back(std::make_pair(fabs(vecCpu->getData()[i]), i)); - std::partial_sort(param.begin(), - param.begin() + nonZeroNum, - param.end(), - sortPairAscend); + std::partial_sort( + param.begin(), param.begin() + nonZeroNum, param.end(), sortPairAscend); for (size_t i = 0; i < nonZeroNum; i++) dataPtr[param[i].second] = 1.0; // Currently just use a mask vector for hack. From 5f924d5d533831c29f1f5243eb1790467c9aac1a Mon Sep 17 00:00:00 2001 From: yangyaming Date: Mon, 19 Jun 2017 18:15:15 +0800 Subject: [PATCH 11/69] Follow comments. --- doc/api/v2/config/evaluators.rst | 9 +++ .../evaluators/DetectionMAPEvaluator.cpp | 66 +++++++++---------- .../trainer_config_helpers/evaluators.py | 6 +- 3 files changed, 43 insertions(+), 38 deletions(-) diff --git a/doc/api/v2/config/evaluators.rst b/doc/api/v2/config/evaluators.rst index 39db51fa4a..9ac972fb19 100644 --- a/doc/api/v2/config/evaluators.rst +++ b/doc/api/v2/config/evaluators.rst @@ -99,3 +99,12 @@ value_printer .. automodule:: paddle.v2.evaluator :members: value_printer :noindex: + +Detection +===== + +detection_map +------------- +.. 
automodule:: paddle.v2.evaluator + :members: detection_map + :noindex: diff --git a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp index 7d326c2db1..9b825db574 100644 --- a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp +++ b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp @@ -80,21 +80,20 @@ public: allGTBBoxes.push_back(bboxes); } - size_t imgId = 0; - for (size_t n = 0; n < cpuOutput_->getHeight();) { + size_t n = 0; + const real* cpuOutputData = cpuOutput_->getData(); + for (size_t imgId = 0; imgId < batchSize; ++imgId) { map>> bboxes; - while (cpuOutput_->getData()[n * 7] == imgId && - n < cpuOutput_->getHeight()) { + size_t curImgId = static_cast((cpuOutputData + n * 7)[0]); + while (curImgId == imgId && n < cpuOutput_->getHeight()) { vector label; vector score; vector bbox; - getBBoxFromDetectData( - cpuOutput_->getData() + n * 7, 1, label, score, bbox); + getBBoxFromDetectData(cpuOutputData + n * 7, 1, label, score, bbox); bboxes[label[0]].push_back(make_pair(score[0], bbox[0])); ++n; + curImgId = static_cast((cpuOutputData + n * 7)[0]); } - ++imgId; - if (imgId > batchSize) break; allDetectBBoxes.push_back(bboxes); } @@ -119,15 +118,14 @@ public: } // calcTFPos - calcTFPos( - batchSize, allGTBBoxes, allDetectBBoxes, &allTruePos_, &allFalsePos_); + calcTFPos(batchSize, allGTBBoxes, allDetectBBoxes); return 0; } virtual void printStats(std::ostream& os) const { real mAP = calcMAP(); - os << "Detection mAP=" << mAP * 100; + os << "Detection mAP=" << mAP; } virtual void distributeEval(ParameterClient2* client) { @@ -138,9 +136,7 @@ protected: void calcTFPos(const size_t batchSize, const vector>>& allGTBBoxes, const vector>>>& - allDetectBBoxes, - map>>* allTruePos, - map>>* allFalsePos) { + allDetectBBoxes) { for (size_t n = 0; n < allDetectBBoxes.size(); ++n) { if (allGTBBoxes[n].size() == 0) { for (map>>::const_iterator @@ -149,8 +145,8 @@ protected: ++it) { size_t label = it->first; for (size_t i = 0; i < it->second.size(); ++i) { - (*allTruePos)[label].push_back(make_pair(it->second[i].first, 0)); - (*allFalsePos)[label].push_back(make_pair(it->second[i].first, 1)); + allTruePos_[label].push_back(make_pair(it->second[i].first, 0)); + allFalsePos_[label].push_back(make_pair(it->second[i].first, 1)); } } } else { @@ -162,9 +158,8 @@ protected: vector> predBBoxes = it->second; if (allGTBBoxes[n].find(label) == allGTBBoxes[n].end()) { for (size_t i = 0; i < predBBoxes.size(); ++i) { - (*allTruePos)[label].push_back(make_pair(predBBoxes[i].first, 0)); - (*allFalsePos)[label].push_back( - make_pair(predBBoxes[i].first, 1)); + allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0)); + allFalsePos_[label].push_back(make_pair(predBBoxes[i].first, 1)); } } else { vector gtBBoxes = @@ -189,22 +184,21 @@ protected: if (evaluateDifficult_ || (!evaluateDifficult_ && !gtBBoxes[maxIdx].isDifficult)) { if (!visited[maxIdx]) { - (*allTruePos)[label].push_back( + allTruePos_[label].push_back( make_pair(predBBoxes[i].first, 1)); - (*allFalsePos)[label].push_back( + allFalsePos_[label].push_back( make_pair(predBBoxes[i].first, 0)); visited[maxIdx] = true; } else { - (*allTruePos)[label].push_back( + allTruePos_[label].push_back( make_pair(predBBoxes[i].first, 0)); - (*allFalsePos)[label].push_back( + allFalsePos_[label].push_back( make_pair(predBBoxes[i].first, 1)); } } } else { - (*allTruePos)[label].push_back( - make_pair(predBBoxes[i].first, 0)); - (*allFalsePos)[label].push_back( + 
allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0)); + allFalsePos_[label].push_back( make_pair(predBBoxes[i].first, 1)); } } @@ -274,7 +268,7 @@ protected: } } if (count != 0) mAP /= count; - return mAP; + return mAP * 100; } void getAccumulation(vector> inPairs, @@ -291,20 +285,22 @@ protected: std::string getTypeImpl() const { return "detection_map"; } - real getValueImpl() const { return calcMAP() * 100; } + real getValueImpl() const { return calcMAP(); } private: - real overlapThreshold_; - bool evaluateDifficult_; - size_t backgroundId_; - std::string apType_; + real overlapThreshold_; // overlap threshold when determining whether matched + bool evaluateDifficult_; // whether evaluate difficult ground truth + size_t backgroundId_; // class index of background + std::string apType_; // how to calculate mAP (Integral or 11point) MatrixPtr cpuOutput_; MatrixPtr cpuLabel_; - map numPos_; - map>> allTruePos_; - map>> allFalsePos_; + map numPos_; // counts of true objects each classification + map>> + allTruePos_; // true positive prediction + map>> + allFalsePos_; // false positive prediction }; REGISTER_EVALUATOR(detection_map, DetectionMAPEvaluator); diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py index 1dcd804803..44d52edfa7 100644 --- a/python/paddle/trainer_config_helpers/evaluators.py +++ b/python/paddle/trainer_config_helpers/evaluators.py @@ -166,9 +166,9 @@ def detection_map_evaluator(input, ap_type="11point", name=None): """ - Detection mAP Evaluator. It will print mean Average Precision for detection. + Detection mAP Evaluator. It will print mean Average Precision (mAP) for detection. - The detection mAP Evaluator according to the detection_output's output count + The detection mAP Evaluator based on the output of detection_output layer counts the true positive and the false positive bbox and integral them to get the mAP. @@ -186,7 +186,7 @@ def detection_map_evaluator(input, :type overlap_threshold: float :param background_id: The background class index. :type background_id: int - :param evaluate_difficult: Wether evaluate a difficult ground truth. + :param evaluate_difficult: Whether evaluate a difficult ground truth. :type evaluate_difficult: bool """ if not isinstance(input, list): From 3438d650edee11f3488994370a95ab11696d28d1 Mon Sep 17 00:00:00 2001 From: xuwei06 Date: Mon, 19 Jun 2017 23:41:49 -0700 Subject: [PATCH 12/69] Fix bugs for rnn generation 1. v2.layer.parse_network does not correctly handle the generation output. 2. GatherAgentLayer does not correctly handle generation output when batch_size > 1 3. 
Fix CustomStackTrace for rnn group --- .../gradientmachines/NeuralNetwork.cpp | 9 +-- .../RecurrentGradientMachine.cpp | 16 +++-- .../RecurrentGradientMachine.h | 1 + paddle/gserver/layers/AgentLayer.cpp | 69 +++++++++---------- .../tests/sample_trainer_nest_rnn_gen.conf | 8 +-- .../trainer/tests/sample_trainer_rnn_gen.conf | 6 +- paddle/utils/CustomStackTrace.h | 6 +- paddle/utils/tests/test_CustomStackTrace.cpp | 1 - python/paddle/v2/layer.py | 16 ++++- 9 files changed, 76 insertions(+), 56 deletions(-) diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index 4512aacc81..a361d7deac 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -241,11 +241,14 @@ void NeuralNetwork::forward(const std::vector& inArgs, dataLayers_[i]->setData(inArgs[i]); } + gLayerStackTrace.set_stage(true); + { for (auto& layer : layers_) { REGISTER_TIMER_INFO("ForwardTimer", layer->getName().c_str()); gLayerStackTrace.push(layer->getName()); layer->forward(passType); + gLayerStackTrace.pop(layer->getName()); } } @@ -254,9 +257,6 @@ void NeuralNetwork::forward(const std::vector& inArgs, for (auto& layer : outputLayers_) { outArgs->push_back(layer->getOutput()); } - if (passType == PASS_TEST) { - gLayerStackTrace.clear(); - } } void NeuralNetwork::resetState() { @@ -283,9 +283,10 @@ void NeuralNetwork::getState(MachineState& machineState) { } void NeuralNetwork::backward(const UpdateCallback& callback) { - gLayerStackTrace.pop(""); // tell layer trace is during backward. + gLayerStackTrace.set_stage(false); FOR_EACH_R(layer, layers_) { REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str()); + gLayerStackTrace.push((*layer)->getName()); if ((*layer)->needGradient()) { (*layer)->backward(callback); } diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 3e93038022..867c99ede3 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -208,6 +208,7 @@ void RecurrentGradientMachine::init( }); CHECK(subModelConfig != config.sub_models().end()); reversed_ = subModelConfig->reversed(); + generating_ = subModelConfig->has_generator(); inFrameLines_.resize(subModelConfig->in_links_size()); for (size_t i = 0; i < inFrameLines_.size(); ++i) { @@ -538,7 +539,7 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, The outputs are outFramesLines_[i].agentLayer */ - if (inFrameLines_.empty() && passType == PASS_TEST) { + if (generating_) { generateSequence(); return; } // else forward.. 
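A few hunks below, fillGenOutputs is also fixed so that sequence start positions are emitted even when num_results_per_sample is 1. The intended layout of the generator output can be pictured with a small illustration (hypothetical ids; the invariant is that generated sequence i spans ids[starts[i]:starts[i+1]]):

    # flattened word ids of three generated sequences
    ids = [2, 7, 9, 1, 4, 4, 8]
    # sequence start positions: len(starts) == num_sequences + 1
    starts = [0, 3, 5, 7]

    seqs = [ids[starts[i]:starts[i + 1]] for i in range(len(starts) - 1)]
    print(seqs)  # [[2, 7, 9], [1, 4], [4, 8]]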
@@ -569,6 +570,9 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, } void RecurrentGradientMachine::backward(const UpdateCallback& callback) { + if (generating_) { + return; + } REGISTER_TIMER_INFO("RecurrentBwTime", "RecurrentBwTime"); AsyncGpuBlock asyncGpuBlock; for (int i = maxSequenceLength_ - 1; i >= 0; --i) { @@ -1321,11 +1325,10 @@ void RecurrentGradientMachine::fillGenOutputs() { batchMachineIdVec_.clear(); generator_.ids.clear(); + int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false); + starts[0] = 0; if (numResults > 1) { real* probs = generator_.outArg.in->getData(); - int* starts = - generator_.outArg.sequenceStartPositions->getMutableData(false); - starts[0] = 0; for (size_t i = 0; i < finalPaths_.size(); ++i) { for (size_t j = 0; j < finalPaths_[i].size(); ++j) { Path& path = finalPaths_[i][j]; @@ -1348,7 +1351,10 @@ void RecurrentGradientMachine::fillGenOutputs() { } else { for (size_t i = 0; i < finalPaths_.size(); ++i) { CHECK(!finalPaths_[i].empty()); - generator_.ids = finalPaths_[i][0].ids; + generator_.ids.insert(generator_.ids.begin(), + finalPaths_[i][0].ids.begin(), + finalPaths_[i][0].ids.end()); + starts[i + 1] = starts[i] + finalPaths_[i][0].ids.size(); } } } diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h index 8d94d7e2df..8e30883ac7 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h @@ -414,6 +414,7 @@ protected: std::vector ids; // store generated sequences Argument outArg; // final output argument }; + bool generating_; Generator generator_; std::vector> frames_; diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index 31463823b3..512932d9a5 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -109,6 +109,40 @@ void GatherAgentLayer::forwardValue(PassType passType) { } } +namespace { + +// dest[index[i]] <- src[i] for each i +void copyElements(const IVector& srcVec, + const IVector& indexVec, + IVector& destVec) { + const int* src = srcVec.getData(); + const int* index = indexVec.getData(); + int* dest = destVec.getData(); + int len = indexVec.getSize(); + CHECK_EQ(srcVec.getSize(), indexVec.getSize()); + for (int i = 0; i < len; ++i) { + dest[index[i]] = src[i]; + } +} +} + +void GatherAgentLayer::forwardIds(PassType passType) { + IVectorPtr realId = realLayers_[0]->getOutputLabel(); + if (!realId) return; + + IVector::resizeOrCreate(output_.ids, allIds_->getSize(), useGpu_); + IVectorPtr outId = output_.ids; + idsVec_.resize(idIndex_.size()); + + for (size_t i = 0; i < realLayers_.size(); ++i) { + const IVectorPtr& realId = realLayers_[i]->getOutputLabel(); + idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i], + /* size */ realId->getSize(), + useGpu_); + execViaCpu(©Elements, *realId, *idsVec_[i], *outId); + } +} + void GatherAgentLayer::backward(const UpdateCallback& callback) { (void)callback; const MatrixPtr& outputGrad = getOutputGrad(); @@ -174,41 +208,6 @@ void ScatterAgentLayer::backward(const UpdateCallback& callback) { REGISTER_LAYER(gather_agent, GatherAgentLayer); REGISTER_LAYER(scatter_agent, ScatterAgentLayer); -void GatherAgentLayer::forwardIds(PassType passType) { - int height = 0; - IVectorPtr idReal = realLayers_[0]->getOutputLabel(); - - if (!idReal) return; - - if (output_.subSequenceStartPositions) { - int* starts = 
output_.subSequenceStartPositions->getMutableData(false); - // Gather generator.idsVec - // if is beam search generation result. Get first result. - if (idReal->getData()[idReal->getSize() - 1] == -1) { - for (size_t i = 0; i < realLayers_.size(); ++i) { - // The first element stores first result size - idReal = realLayers_[i]->getOutputLabel(); - idReal->subVecFrom(*idReal, 1, idReal->getData()[0]); - } - } - for (size_t i = 0; i < realLayers_.size(); ++i) { - CHECK(realLayers_[i]->getOutputLabel()); - starts[i] = height; - height += realLayers_[i]->getOutputLabel()->getSize(); - } - starts[realLayers_.size()] = height; - output_.sequenceStartPositions->getMutableData(false)[1] = height; - - IVector::resizeOrCreate(output_.ids, height, false); - for (size_t i = 0; i < realLayers_.size(); ++i) { - output_.ids->subVec(starts[i], starts[i + 1] - starts[i]) - ->copyFrom(*realLayers_[i]->getOutputLabel()); - } - } else { - LOG(FATAL) << "Not implemented"; - } -} - void ScatterAgentLayer::forwardSequence(PassType passType) { Layer::forward(passType); CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); diff --git a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf index d669fbc40c..741a0aa71d 100644 --- a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf +++ b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf @@ -35,7 +35,7 @@ def outer_step(dummy_data): embedding_size=num_words)] def inner_step(dummy_memory, predict_word): - + # simplified RNN for testing with mixed_layer(size=num_words) as layer: layer += full_matrix_projection(input=predict_word, @@ -46,15 +46,15 @@ def outer_step(dummy_data): param_attr=ParamAttr(name="wordvec")) return out - + beam_gen = beam_search(name="rnn_gen", step=inner_step, input=gen_inputs, bos_id=0, eos_id=num_words-1, beam_size=2 if beam_flag else 1, - num_results_per_sample=2 if beam_flag else 1, - max_length=10) + num_results_per_sample=1, + max_length=10) return beam_gen beam_gen_concat = recurrent_group(name="rnn_gen_concat", diff --git a/paddle/trainer/tests/sample_trainer_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_rnn_gen.conf index 2b337282f6..58d27f15ae 100644 --- a/paddle/trainer/tests/sample_trainer_rnn_gen.conf +++ b/paddle/trainer/tests/sample_trainer_rnn_gen.conf @@ -33,7 +33,7 @@ gen_inputs = [StaticInput(input=dummy_data, size=2), embedding_size=num_words)] def step(dummy_memory, predict_word): - + # simplified RNN for testing with mixed_layer(size=num_words) as layer: layer += full_matrix_projection(input=predict_word, @@ -44,7 +44,7 @@ def step(dummy_memory, predict_word): param_attr=ParamAttr(name="wordvec")) return out - + beam_gen = beam_search(name="rnn_gen", step=step, input=gen_inputs, @@ -52,7 +52,7 @@ beam_gen = beam_search(name="rnn_gen", eos_id=num_words-1, beam_size=2 if beam_flag else 1, num_results_per_sample=2 if beam_flag else 1, - max_length=10) + max_length=10) seqtext_printer_evaluator(input=beam_gen, id_input=sent_id, diff --git a/paddle/utils/CustomStackTrace.h b/paddle/utils/CustomStackTrace.h index 6992e85622..52a6df9497 100644 --- a/paddle/utils/CustomStackTrace.h +++ b/paddle/utils/CustomStackTrace.h @@ -55,13 +55,17 @@ public: * Else, just set status to popping. 
   */
  void pop(const T& item) {
-    pushing() = false;
    auto& s = this->stack();
    if (item == s.top()) {
      s.pop();
    }
  }
 
+  /**
+   * @brief Indicate whether we are at forward or backward stage of computation
+   */
+  void set_stage(bool isForward) { pushing() = isForward; }
+
   /**
    * @brief clear current thread stack.
    */
diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp
index b5d9f93f13..c320074fba 100644
--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
@@ -72,7 +72,6 @@ TEST(CustomStackTrace, normalTrain) {
     for (size_t i = 0; i < layerSize; ++i) {
       tracer.push("layer_" + paddle::str::to_string(i));
     }
-    tracer.pop("");
     for (size_t i = 0; i < layerSize; ++i) {
       tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i));
     }
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index bbb9c3ea8c..4ade1c6f32 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -45,12 +45,12 @@ __all__ = ['data', 'parse_network']
 def __need_to_keep__(name):
     return name in [
         'StaticInput', 'SubsequenceInput', 'GeneratedInput', 'LayerType',
-        'layer_support'
+        'layer_support', 'BaseGeneratedInput'
     ]
 
 
 def __need_to_wrap__(name):
-    return name not in ['AggregateLevel', 'ExpandLevel']
+    return name not in ['AggregateLevel', 'ExpandLevel', 'BaseGeneratedInput']
 
 
 def __convert_name__(inname):
@@ -199,6 +199,15 @@ def __get_used_submodels__(layer_names):
     return submodel_names
 
 
+def __get_submodel_data_out_links__():
+    data_links = set()
+    for submodel in cp.g_config.model_config.sub_models:
+        for link in submodel.out_links:
+            if cp.g_layer_map[link.link_name].type == 'data':
+                data_links.add(link.link_name)
+    return data_links
+
+
 def __get_used_evaluators__(layer_names):
     evaluator_names = set()
     for e in cp.g_config.model_config.evaluators:
@@ -264,6 +273,7 @@ def parse_network(output_layers, extra_layers=None):
     submodel_names = __get_used_submodels__(layer_names)
     submodel_names.add('root')
     evaluator_names = __get_used_evaluators__(layer_names)
+    data_out_links = __get_submodel_data_out_links__()
 
     input_layer_names = set()
     output_layer_names = set()
@@ -279,7 +289,7 @@ def parse_network(output_layers, extra_layers=None):
             continue
         model_config.layers.extend([l])
         if l.type == 'data':
-            if l.name in model_config.output_layer_names:
+            if l.name in data_out_links:
                 """
                 In text generation, the outlink to save the generated word
                 indices is a data_layer defined in recurrent_group. This

From 02cc7d90a606875a44d605c18e17855ce8339652 Mon Sep 17 00:00:00 2001
From: xuwei06
Date: Mon, 19 Jun 2017 13:23:24 -0700
Subject: [PATCH 13/69] Evaluator for recurrent group

Make the evaluators inside a recurrent group true evaluators, meaning
that their evaluation results are incorporated into the whole evaluator
result.
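Conceptually, this patch makes `NeuralNetwork::makeEvaluator()` wrap the evaluator of every layer that exposes a sub-network (i.e., every recurrent group) and register it with the outer network's combined evaluator, so that evaluating the whole network recurses into the group. A simplified sketch of that wiring, using the names this patch introduces (it is a fragment only; the authoritative code is in the diff below):

```cpp
// Sketch only: for each layer that owns a sub-network, build the
// subnet's own evaluator, wrap it in a SubnetEvaluator keyed by the
// layer's name, and add it to the outer CombinedEvaluator.
for (auto& layer : layers_) {
  layer->accessSubNetwork([&](NeuralNetwork& subnet) {
    std::unique_ptr<Evaluator> subEvaluator(new SubnetEvaluator(
        layer->getName(), std::unique_ptr<Evaluator>(subnet.makeEvaluator())));
    combinedEvaluator->addEvaluator(std::move(subEvaluator));
  });
}
```

When `SubnetEvaluator::eval()` later runs, it looks the layer up by name and evaluates the wrapped evaluator against the sub-network, frame by frame in the recurrent case.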
---
 .../gradientmachines/NeuralNetwork.cpp        | 35 ++++++++++++++++++-
 .../gserver/gradientmachines/NeuralNetwork.h  |  2 ++
 .../RecurrentGradientMachine.cpp              | 20 ++---------
 .../RecurrentGradientMachine.h                |  2 --
 4 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index 4512aacc81..f245c16bfe 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -320,7 +320,7 @@ public:
     }
   }
 
-  virtual void eval(const NeuralNetwork& nn) {
+  virtual void eval(const NeuralNetwork& nn) override {
     for (auto& evaluator : evaluators_) {
       evaluator->eval(nn);
     }
@@ -395,6 +395,30 @@ private:
   }
 };
 
+class SubnetEvaluator : public CombinedEvaluator {
+public:
+  SubnetEvaluator(const std::string& layerName,
+                  std::unique_ptr<Evaluator>&& evaluator)
+      : layerName_(layerName) {
+    addEvaluator(std::move(evaluator));
+  }
+  virtual void eval(const NeuralNetwork& nn) override {
+    const LayerPtr& layer = nn.getLayer(layerName_);
+    CHECK(layer) << "Nonexistent layer: " << layerName_ << " in submodel "
+                 << nn.getName();
+    bool accessed = false;
+    layer->accessSubNetwork([this, &accessed](NeuralNetwork& subnet) {
+      subnet.eval(evaluators_[0].get());
+      accessed = true;
+    });
+    CHECK(accessed) << "There is no subnetwork for layer " << layerName_
+                    << " in submodel " << nn.getName();
+  }
+
+protected:
+  std::string layerName_;
+};
+
 Evaluator* NeuralNetwork::makeEvaluator() const {
   CombinedEvaluator* combinedEvaluator = new CombinedEvaluator();
   auto subModelConfig = std::find_if(config_.sub_models().begin(),
@@ -421,6 +445,15 @@ Evaluator* NeuralNetwork::makeEvaluator() const {
       combinedEvaluator->addEvaluator(std::move(evaluator));
     }
   }
+  for (auto& layer : layers_) {
+    layer->accessSubNetwork(
+        [layer, combinedEvaluator](NeuralNetwork& subnet) {
+          std::unique_ptr<Evaluator> subEvaluator(new SubnetEvaluator(
+              layer->getName(),
+              std::unique_ptr<Evaluator>(subnet.makeEvaluator())));
+          combinedEvaluator->addEvaluator(std::move(subEvaluator));
+        });
+  }
 } else {
   for (const EvaluatorConfig& evalConfig : config_.evaluators()) {
     std::unique_ptr<Evaluator> evaluator(Evaluator::create(evalConfig));
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index e7b6c43840..12810f6425 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -129,6 +129,8 @@ public:
   static NeuralNetwork* newNeuralNetwork(const std::string& name = "",
                                          NeuralNetwork* rootNetwork = nullptr);
 
+  const std::string& getName() const { return subModelName_; }
+
 protected:
   /**
    * The constructor of NeuralNetwork.
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 3e93038022..5d4b67da84 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -287,10 +287,6 @@ void RecurrentGradientMachine::init( parameterIds_.push_back(para->getID()); } } - - if (subModelConfig->evaluator_names_size() > 0) { - evaluator_.reset(frames_[0]->makeEvaluator()); - } } void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) { @@ -561,9 +557,6 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, std::vector outArgs; frames_[i]->forward(inArgs, &outArgs, passType); } - if (evaluator_ && passType == PASS_TEST) { - this->eval(evaluator_.get()); - } reorganizeOutput(passType); } @@ -577,11 +570,6 @@ void RecurrentGradientMachine::backward(const UpdateCallback& callback) { for (auto& memoryFrameLine : memoryFrameLines_) { memoryFrameLine.bootLayer->backward(nullptr); } - - // call printers here so the gradient can be printed - if (evaluator_) { - this->eval(evaluator_.get()); - } } void RecurrentGradientMachine::forwardBackward( @@ -595,9 +583,9 @@ void RecurrentGradientMachine::forwardBackward( void RecurrentGradientMachine::eval(Evaluator* evaluator) const { // call printers frame by frame for (int i = 0; i < maxSequenceLength_; ++i) { - LOG(INFO) << "Recurrent Layer Group eval frame " << i << " begin"; + VLOG(2) << "Recurrent Layer Group eval frame " << i << " begin"; evaluator->eval(*(frames_[i].get())); - LOG(INFO) << "Recurrent Layer Group eval frame " << i << " end"; + VLOG(2) << "Recurrent Layer Group eval frame " << i << " end"; } } @@ -1093,10 +1081,6 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { copyDataOutlinkFrame(machineCur); - // call value printer - if (evaluator_) { - evaluator_->eval(*(frames_[machineCur].get())); - } // check eos const IVectorPtr& eosVec = eosFrameLine_->layers[machineCur]->getOutput().ids; diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h index 8d94d7e2df..9f957a9401 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h @@ -428,8 +428,6 @@ protected: std::vector parameterIds_; // parameters actually used by this Layer Group - std::unique_ptr evaluator_; // frame printers in this layer group - // store final argument of outFrameLines_ std::vector dataArgs_; // store each frame's output argument of outFrameLines_ From c9a76ebba2c0b050c157232c13670b17d2ba806d Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Tue, 20 Jun 2017 16:03:49 +0800 Subject: [PATCH 14/69] modified xmap reader to process sample by order --- python/paddle/v2/reader/decorator.py | 36 ++++++++++++++++--- .../paddle/v2/reader/tests/decorator_test.py | 18 ++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index c76faa596c..68ffbd6f3d 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -230,7 +230,7 @@ class XmapEndSignal(): pass -def xmap_readers(mapper, reader, process_num, buffer_size): +def xmap_readers(mapper, reader, process_num, buffer_size, order=False): """ Use multiprocess to map samples from reader by a mapper defined by user. And this function contains a buffered decorator. 
@@ -242,21 +242,32 @@ def xmap_readers(mapper, reader, process_num, buffer_size): :type process_num: int :param buffer_size: max buffer size :type buffer_size: int + :param order: keep the order of reader + :type order: bool :return: the decarated reader :rtype: callable """ end = XmapEndSignal() in_queue = Queue(buffer_size) out_queue = Queue(buffer_size) - + out_order = [0] # define a worker to read samples from reader to in_queue def read_worker(reader, in_queue): for i in reader(): in_queue.put(i) in_queue.put(end) + + # define a worker to read samples from reader to in_queue with order flag + def order_read_worker(reader, in_queue): + in_order = 0 + for i in reader(): + in_queue.put((in_order,i)) + in_order+=1 + in_queue.put(end) # start a read worker in a thread - t = Thread(target=read_worker, args=(reader, in_queue)) + target = order_read_worker if order else read_worker + t = Thread(target=target, args=(reader, in_queue)) t.daemon = True t.start() @@ -270,12 +281,29 @@ def xmap_readers(mapper, reader, process_num, buffer_size): sample = in_queue.get() in_queue.put(end) out_queue.put(end) + + # define a worker to handle samples from in_queue by mapper + # and put mapped samples into out_queue by order + def order_handle_worker(in_queue, out_queue, mapper, out_order): + ins = in_queue.get() + while not isinstance(ins, XmapEndSignal): + order, sample = ins + r = mapper(sample) + while order != out_order[0]: + pass + out_queue.put(r) + out_order[0] += 1 + ins = in_queue.get() + in_queue.put(end) + out_queue.put(end) # start several handle_workers + target = order_handle_worker if order else handle_worker + args = (in_queue, out_queue, mapper, out_order) if order else (in_queue, out_queue, mapper) workers = [] for i in xrange(process_num): worker = Thread( - target=handle_worker, args=(in_queue, out_queue, mapper)) + target=target, args=args) worker.daemon = True workers.append(worker) for w in workers: diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py index 734154b979..76db91a44b 100644 --- a/python/paddle/v2/reader/tests/decorator_test.py +++ b/python/paddle/v2/reader/tests/decorator_test.py @@ -120,6 +120,24 @@ class TestShuffle(unittest.TestCase): total += 1 self.assertEqual(total, 10) +class TestXmap(unittest.TestCase): + def test_xmap(self): + def mapper(x): + return (x + 1) + orders = (True, False) + thread_nums = (1, 2, 4, 8, 16) + buffered_size = (1, 2, 4, 8, 16) + for order in orders: + for tNum in thread_nums: + for size in buffered_size: + result = [] + for i in paddle.v2.reader.xmap_readers(mapper, reader_creator_10(), tNum, size, order)(): + result.append(i) + if not order: + result.sort() + for idx, e in enumerate(result): + self.assertEqual(e, mapper(idx)) + if __name__ == '__main__': unittest.main() From 8bc07dee4e3c1d01e0c5f5f229fd13cadc74ace8 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Tue, 20 Jun 2017 16:11:14 +0800 Subject: [PATCH 15/69] format code --- python/paddle/v2/reader/decorator.py | 17 +++++++++-------- python/paddle/v2/reader/tests/decorator_test.py | 8 ++++++-- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index 68ffbd6f3d..e432003129 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -251,18 +251,19 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): in_queue = Queue(buffer_size) out_queue = Queue(buffer_size) out_order = [0] 
+ # define a worker to read samples from reader to in_queue def read_worker(reader, in_queue): for i in reader(): in_queue.put(i) in_queue.put(end) - + # define a worker to read samples from reader to in_queue with order flag def order_read_worker(reader, in_queue): in_order = 0 for i in reader(): - in_queue.put((in_order,i)) - in_order+=1 + in_queue.put((in_order, i)) + in_order += 1 in_queue.put(end) # start a read worker in a thread @@ -281,7 +282,7 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): sample = in_queue.get() in_queue.put(end) out_queue.put(end) - + # define a worker to handle samples from in_queue by mapper # and put mapped samples into out_queue by order def order_handle_worker(in_queue, out_queue, mapper, out_order): @@ -292,18 +293,18 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): while order != out_order[0]: pass out_queue.put(r) - out_order[0] += 1 + out_order[0] += 1 ins = in_queue.get() in_queue.put(end) out_queue.put(end) # start several handle_workers target = order_handle_worker if order else handle_worker - args = (in_queue, out_queue, mapper, out_order) if order else (in_queue, out_queue, mapper) + args = (in_queue, out_queue, mapper, out_order) if order else ( + in_queue, out_queue, mapper) workers = [] for i in xrange(process_num): - worker = Thread( - target=target, args=args) + worker = Thread(target=target, args=args) worker.daemon = True workers.append(worker) for w in workers: diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py index 76db91a44b..0bd7733955 100644 --- a/python/paddle/v2/reader/tests/decorator_test.py +++ b/python/paddle/v2/reader/tests/decorator_test.py @@ -120,10 +120,12 @@ class TestShuffle(unittest.TestCase): total += 1 self.assertEqual(total, 10) + class TestXmap(unittest.TestCase): def test_xmap(self): def mapper(x): return (x + 1) + orders = (True, False) thread_nums = (1, 2, 4, 8, 16) buffered_size = (1, 2, 4, 8, 16) @@ -131,13 +133,15 @@ class TestXmap(unittest.TestCase): for tNum in thread_nums: for size in buffered_size: result = [] - for i in paddle.v2.reader.xmap_readers(mapper, reader_creator_10(), tNum, size, order)(): + for i in paddle.v2.reader.xmap_readers(mapper, + reader_creator_10(), + tNum, size, order)(): result.append(i) if not order: result.sort() for idx, e in enumerate(result): self.assertEqual(e, mapper(idx)) - + if __name__ == '__main__': unittest.main() From b6910529181cdd1d0f560bf71d77a3fed43886f6 Mon Sep 17 00:00:00 2001 From: xuwei06 Date: Tue, 20 Jun 2017 15:10:46 -0700 Subject: [PATCH 16/69] Fix bug of ScatterAgentLayer for generation --- paddle/gserver/layers/AgentLayer.cpp | 61 +++++++++++++++------------- paddle/gserver/layers/AgentLayer.h | 7 +++- 2 files changed, 38 insertions(+), 30 deletions(-) diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index 512932d9a5..15e7411b5f 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -170,23 +170,22 @@ void ScatterAgentLayer::forward(PassType passType) { CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); int width = this->getSize(); - if (realOutArg_.hasSeq()) { - forwardSequence(passType); - } else if (realOutArg_.value || realOutArg_.ids) { - output_.subArgFrom( - realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_); - } else { // used in generation - if (realLayer_->getOutput().ids) { - IVector::resizeOrCreate(output_.ids, ids_->getSize(), 
useGpu_); - output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_); - } - if (realLayer_->getOutput().value) { - int height = ids_->getSize(); - resetOutput(height, width); - - const MatrixPtr& outV = getOutputValue(); - const MatrixPtr& realV = realLayer_->getOutputValue(); - outV->selectRows(*realV, *ids_); + if (selectionMode_) { + forwardWithSelection(passType); + } else { + if (realOutArg_.hasSeq()) { + output_.subArgFrom(realOutArg_, + /* offset */ idIndex_, + idSize_, + width, + useGpu_, + /* trans */ false, + /* seqFlag */ true, + /* seqStart */ seqStartPosIndex_, + /* seqSize */ numSequences_); + } else { + output_.subArgFrom( + realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_); } } } @@ -194,6 +193,8 @@ void ScatterAgentLayer::forward(PassType passType) { void ScatterAgentLayer::backward(const UpdateCallback& callback) { (void)callback; + CHECK(!selectionMode_); + const MatrixPtr& outputGrad = realOutArg_.grad; const MatrixPtr& realGrad = realLayer_->getOutputGrad(); if (realGrad) { @@ -208,7 +209,7 @@ void ScatterAgentLayer::backward(const UpdateCallback& callback) { REGISTER_LAYER(gather_agent, GatherAgentLayer); REGISTER_LAYER(scatter_agent, ScatterAgentLayer); -void ScatterAgentLayer::forwardSequence(PassType passType) { +void ScatterAgentLayer::forwardWithSelection(PassType passType) { Layer::forward(passType); CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); @@ -219,17 +220,19 @@ void ScatterAgentLayer::forwardSequence(PassType passType) { AsyncGpuBlock asyncGpuBlock; REGISTER_TIMER_INFO("SequenceAgentLayerForward", getName().c_str()); - if (realOutArg_.value || realOutArg_.ids) { - CHECK(realOutArg_.sequenceStartPositions); - output_.subArgFrom(realOutArg_, - /* offset */ idIndex_, - idSize_, - width, - useGpu_, - /* trans */ false, - /* seqFlag */ true, - /* seqStart */ seqStartPosIndex_, - /* seqSize */ numSequences_); + if (!input.hasSeq()) { + if (realLayer_->getOutput().ids) { + IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_); + output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_); + } + if (realLayer_->getOutput().value) { + int height = ids_->getSize(); + resetOutput(height, width); + + const MatrixPtr& outV = getOutputValue(); + const MatrixPtr& realV = realLayer_->getOutputValue(); + outV->selectRows(*realV, *ids_); + } } else { // Putting the generation logic here is really an ugly hack! // used in generation diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h index 461b84b17e..29681b29c6 100644 --- a/paddle/gserver/layers/AgentLayer.h +++ b/paddle/gserver/layers/AgentLayer.h @@ -110,6 +110,9 @@ protected: // of real layer. 
   ICpuGpuVectorPtr inputStartPos_;
 
+  // true for setRealLayer, false for setRealLayerAndOutput
+  bool selectionMode_;
+
 public:
   explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {}
 
@@ -137,6 +140,7 @@ public:
     } else {
       cpuIds_ = ids_;
     }
+    selectionMode_ = true;
   }
 
   // set real layer and output, [idIndex, idIndex + idSize) of *ids*
@@ -153,6 +157,7 @@ public:
     idIndex_ = idIndex;
     idSize_ = idSize;
     handleBackward_ = handleBackward;
+    selectionMode_ = false;
   }
 
   void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions,
@@ -166,7 +171,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
 
-  void forwardSequence(PassType passType);
+  void forwardWithSelection(PassType passType);
 };
 }  // namespace paddle

From ff4be82252d797746b3a4169137c7fcfd9ee7039 Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Tue, 20 Jun 2017 17:54:10 -0700
Subject: [PATCH 17/69] Handle multiple processes trying to create the data
 home directory

---
 python/paddle/v2/dataset/common.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
index e09ac1a7a0..72894c24b1 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -27,13 +27,17 @@ __all__ = ['DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader']
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
 
-if not os.path.exists(DATA_HOME):
-    try:
-        os.makedirs(DATA_HOME)
-    except OSError as exc:
-        if exc.errno != errno.EEXIST:
-            raise
-        pass
+# When running unit tests, there could be multiple processes trying to
+# create the DATA_HOME directory simultaneously, so we cannot use an if
+# condition to check for the existence of the directory; instead, we use
+# the filesystem as the synchronization mechanism by catching the
+# returned errors.
+try:
+    os.makedirs(DATA_HOME)
+except OSError as exc:
+    if exc.errno != errno.EEXIST:
+        raise
+    pass
 
 
 def md5file(fname):

From 1eab8cce32b61f201098be482359defbfffc941b Mon Sep 17 00:00:00 2001
From: zlx
Date: Wed, 21 Jun 2017 14:31:29 +0800
Subject: [PATCH 18/69] modify the annotations of HookAttribute and variable
 declarations

---
 paddle/parameter/ParameterUpdaterHook.cpp     | 31 ++++++++++---------
 python/paddle/trainer_config_helpers/attrs.py | 20 +++++++-----
 2 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp
index 738e86a622..66e554a70d 100644
--- a/paddle/parameter/ParameterUpdaterHook.cpp
+++ b/paddle/parameter/ParameterUpdaterHook.cpp
@@ -31,9 +31,9 @@ namespace paddle {
 
 /**
  * The static pruning hook
- * Static means user specific a sparsity_ratio before training start, and the
+ * Static means the user specifies a sparsity_ratio before training starts, and the
 * network will prune the parameters based on the sparsity_ratio. More deatils
- * can see https://arxiv.org/pdf/1506.02626.pdf.
+ * can be found at https://arxiv.org/pdf/1506.02626.pdf.
  */
 
 class StaticPruningHook : public IParameterUpdaterHook {
 public:
   explicit StaticPruningHook(const ParameterUpdaterHookConfig& hookConfig)
       : initCount_(0) {
     sparsityRatio_ = hookConfig.sparsity_ratio();
   }
@@ -57,29 +57,31 @@ public:
   }
 
   void generateMask(Parameter* para) {
-    VectorPtr vec = para->getBuf(PARAMETER_VALUE);
-    maskTemp_ = Vector::create(para->getSize(), false);
-    maskTemp_->zeroMem();
-    real* dataPtr = maskTemp_->getData();
+
+    VectorPtr maskTemp = Vector::create(para->getSize(), false);
+    maskTemp->zeroMem();
+    real* maskTempData = maskTemp->getData();
     size_t nonZeroNum = para->getSize() * (1 - sparsityRatio_);
 
-    VectorPtr vecCpu = Vector::create(para->getSize(), false);
-    vecCpu->copyFrom(*vec);
+    VectorPtr paraVec = para->getBuf(PARAMETER_VALUE);
+    VectorPtr paraCpuCopy = Vector::create(para->getSize(), false);
+
+    paraCpuCopy->copyFrom(*paraVec);
     std::vector<std::pair<real, size_t>> param;
 
     for (size_t i = 0; i < para->getSize(); i++)
-      param.push_back(std::make_pair(fabs(vecCpu->getData()[i]), i));
+      param.push_back(std::make_pair(fabs(paraCpuCopy->getData()[i]), i));
 
     std::partial_sort(
         param.begin(), param.begin() + nonZeroNum, param.end(), sortPairAscend);
-    for (size_t i = 0; i < nonZeroNum; i++) dataPtr[param[i].second] = 1.0;
+    for (size_t i = 0; i < nonZeroNum; i++) maskTempData[param[i].second] = 1.0;
 
     // Currently just use a mask vector for hack.
     if (para->useGpu()) {
       maskVec_ = Vector::create(para->getSize(), para->useGpu());
-      maskVec_->copyFrom(*maskTemp_);
+      maskVec_->copyFrom(*maskTemp);
     } else {
-      maskVec_ = maskTemp_;
+      maskVec_ = maskTemp;
     }
   }
 
@@ -91,15 +93,14 @@ public:
     VLOG(3) << "Initialize Parameter " << para;
     SetDevice device(para->getDeviceId());
 
-    auto& vec = para->getBuf(PARAMETER_VALUE);
-    vec->dotMul(*maskVec_);
+    auto& paraVec = para->getBuf(PARAMETER_VALUE);
+    paraVec->dotMul(*maskVec_);
   }
 
 private:
   SameThreadChecker updateThreadChecker_;
   std::atomic<size_t> initCount_;
   VectorPtr maskVec_;
-  VectorPtr maskTemp_;
   real sparsityRatio_;
 };
 
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index bf12ad644d..66163bdc8d 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -58,15 +58,21 @@ def is_compatible_with(x, Type):
 
 class HookAttribute(object):
     """
-    Hook Attribute object. The hook is an auxiliary operation that occurs
-    during network propagation.
-    NOTE: IT IS A HIGH LEVEL USER INTERFACE.
-
-    :param type: Hook type, eg: 'pruning'
+    Hook Attribute object. As a member of the ParameterAttribute class, the hook
+    is an auxiliary operation that occurs during the training process of a layer
+    with parameters, such as an img_conv layer or an fc layer.
+
+    :param type: Hook type, currently supported types:
+                 'pruning' : the user specifies a sparsity_ratio before training
+                 starts, and the network will prune the parameters based on the
+                 sparsity_ratio.
+                 e.g. a Hook object can be defined as
+                 hk = HookAttribute('pruning', 0.6)
+                 and then used as
+                 paddle.layer.img_conv(input=img, filter_size=3,
+                                       num_channels=3, num_filters=64,
+                                       param_attr=ParameterAttribute(update_hooks=hk))
+                 The pruning details can be found at
+                 https://arxiv.org/pdf/1506.02626.pdf
     :type type: string
 
    :param sparsity_ratio: Must be specified if hook type is 'pruning',
-                           it represents the ratio of the zero elements to be set by the Parameter.
:type sparsity_ratio: float or None """ @@ -78,7 +84,7 @@ class HookAttribute(object): assert is_compatible_with( self.sparsity_ratio, float), 'sparisity_ratio must be float type' - assert self.sparsity_ratio <= 1 and self.sparsity_ratio >= 0, 'sparisity must be a flaot between [0, 1] ' + assert self.sparsity_ratio <= 1 and self.sparsity_ratio >= 0, 'sparisity_ratio must be a float between [0, 1] ' def __call__(self): return ParameterHook(self.type, sparsity_ratio=self.sparsity_ratio) From badcdfe1e539ffcad75f601e687a83fd1512cff1 Mon Sep 17 00:00:00 2001 From: wuyi05 Date: Wed, 21 Jun 2017 15:05:41 +0800 Subject: [PATCH 19/69] pserver etcd registration --- go/cmd/pserver/pserver.go | 20 ++++++- go/pserver/client_test.go | 8 ++- go/pserver/service.go | 112 ++++++++++++++++++++++++++++++++++++- go/pserver/service_test.go | 25 ++++++--- go/utils/helper.go | 45 +++++++++++++++ go/utils/helper_test.go | 10 ++++ 6 files changed, 206 insertions(+), 14 deletions(-) create mode 100644 go/utils/helper.go create mode 100644 go/utils/helper_test.go diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index f0be251c24..ddf5ad40fd 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -5,18 +5,34 @@ import ( "net/http" "net/rpc" "strconv" + "time" "github.com/namsral/flag" "github.com/PaddlePaddle/Paddle/go/pserver" + log "github.com/sirupsen/logrus" ) func main() { port := flag.Int("port", 0, "port of the pserver") + etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379", + "comma separated endpoint string for pserver to connect to etcd") + etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls") + logLevel := flag.String("log-level", "info", "log level, one of debug") flag.Parse() - s := pserver.NewService() - err := rpc.Register(s) + level, err := log.ParseLevel(*logLevel) + if err != nil { + panic(err) + } + log.SetLevel(level) + + timeout := time.Second * time.Duration((*etcdTimeout)) + s, err := pserver.NewService(*etcdEndpoint, timeout) + if err != nil { + panic(err) + } + err = rpc.Register(s) if err != nil { panic(err) } diff --git a/go/pserver/client_test.go b/go/pserver/client_test.go index d0371a26a1..6ecf1fa08a 100644 --- a/go/pserver/client_test.go +++ b/go/pserver/client_test.go @@ -7,6 +7,7 @@ import ( "strconv" "strings" "testing" + "time" "github.com/PaddlePaddle/Paddle/go/pserver" ) @@ -30,9 +31,12 @@ func init() { port[i] = p go func(l net.Listener) { - s := pserver.NewService() + s, err := pserver.NewService("", time.Second*5) + if err != nil { + panic(err) + } server := rpc.NewServer() - err := server.Register(s) + err = server.Register(s) if err != nil { panic(err) } diff --git a/go/pserver/service.go b/go/pserver/service.go index 78a2bfaf63..a5c76857ab 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -1,9 +1,18 @@ package pserver import ( + "context" "errors" "fmt" + "strconv" + "strings" "sync" + "time" + + "github.com/PaddlePaddle/Paddle/go/utils" + "github.com/coreos/etcd/clientv3" + "github.com/coreos/etcd/clientv3/concurrency" + log "github.com/sirupsen/logrus" ) // ElementType is the type of elements of a Parameter. @@ -47,14 +56,113 @@ type Service struct { mu sync.Mutex opt *optimizer paramMap map[string]Parameter + + etcdEndpoints string + etcdClient *clientv3.Client + // etcdTimeout is also used as retry intervals. + etcdTimeout time.Duration + // desired number of pservers in the job. + // assume desired will not change during one training job. 
+	desired int
+	// FIXME: ensure GetExternalIP gets the correct ip for trainers to connect.
+	externalIP string
 }
 
 // NewService creates a new service.
-func NewService() *Service {
+func NewService(endpoints string, timeout time.Duration) (*Service, error) {
 	s := &Service{opt: newOptimizer(sgd, 0.005)}
 	s.paramMap = make(map[string]Parameter)
 	s.initialized = make(chan struct{})
-	return s
+	s.etcdEndpoints = endpoints
+	s.etcdTimeout = timeout
+
+	var err error
+	s.externalIP, err = utils.GetExternalIP()
+	if err != nil {
+		return nil, err
+	}
+
+	if endpoints != "" {
+		// initialize connection to etcd, retrying until it succeeds
+		ep := strings.Split(s.etcdEndpoints, ",")
+		for {
+			cli, err := clientv3.New(clientv3.Config{
+				Endpoints:   ep,
+				DialTimeout: s.etcdTimeout,
+			})
+			if err != nil {
+				log.Errorf("connect to etcd error: %v", err)
+				time.Sleep(s.etcdTimeout)
+				continue
+			}
+			s.etcdClient = cli
+			log.Debugf("initialized client to %s", s.etcdEndpoints)
+			break
+		}
+		// wait for and set the initial value of s.desired
+		for {
+			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+			resp, err := s.etcdClient.Get(ctx, "/ps_desired")
+			cancel()
+			if err != nil {
+				log.Errorf("getting /ps_desired error: %v", err)
+				time.Sleep(s.etcdTimeout)
+				continue
+			}
+			for _, ev := range resp.Kvs {
+				log.Debugf("key: %s, value: %s", ev.Key, ev.Value)
+				if string(ev.Key) == "/ps_desired" {
+					s.desired, err = strconv.Atoi(string(ev.Value))
+					if err != nil {
+						log.Errorf("value of /ps_desired invalid %v\n", err)
+						time.Sleep(s.etcdTimeout)
+						// NOTE: wait until the ps_desired value changes
+						continue
+					}
+				}
+			}
+			break
+		}
+		s.registerPserverEtcd()
+	} // if endpoints != ""
+	// Bypass etcd registration if no endpoints specified
+	return s, nil
+}
+
+// registerPserverEtcd registers the pserver node on etcd using a transaction.
+func (s *Service) registerPserverEtcd() (*clientv3.TxnResponse, error) {
+	return concurrency.NewSTMRepeatable(context.TODO(), s.etcdClient, func(c concurrency.STM) error {
+		for i := 0; i < s.desired; i++ {
+			psKey := "/ps/" + strconv.Itoa(i)
+			log.Debugf("checking %s", psKey)
+			ps := c.Get(psKey)
+			log.Debugf("got value (%s) for key: %s", ps, psKey)
+
+			resp, err := s.etcdClient.Grant(context.TODO(), 5)
+			if err != nil {
+				log.Fatal(err)
+			}
+
+			if ps == "" {
+				// find the first id and write info
+				c.Put(psKey, s.externalIP, clientv3.WithLease(resp.ID))
+				log.Debugf("set pserver node %s with value %s", psKey, s.externalIP)
+				ch, kaerr := s.etcdClient.KeepAlive(context.TODO(), resp.ID)
+				if kaerr != nil {
+					log.Errorf("keepalive etcd node error: %v", kaerr)
+					return kaerr
+				}
+				// FIXME: is this really needed?
+				go func(ch <-chan *clientv3.LeaseKeepAliveResponse) {
+					ka := <-ch
+					log.Debugf("keepalive: %d\n", ka.TTL)
+				}(ch)
+				break
+			}
+		}
+		log.Debug("register finished")
+		return nil
+	})
+}
 
 // InitParam initializes a parameter.
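With these changes, a pserver binary would bring the service up roughly as follows. This is a minimal sketch based on the flags shown in pserver.go above, with error handling abbreviated; it is not the verbatim main():

```go
package main

import (
	"net/rpc"
	"time"

	"github.com/PaddlePaddle/Paddle/go/pserver"
)

func main() {
	// With a non-empty etcd endpoint, NewService blocks until /ps_desired
	// is readable and this node has claimed a free /ps/<i> key; with an
	// empty endpoint string it skips etcd registration entirely.
	timeout := time.Second * 5
	s, err := pserver.NewService("http://127.0.0.1:2379", timeout)
	if err != nil {
		panic(err)
	}
	// The RPC wiring is unchanged by this patch.
	if err := rpc.Register(s); err != nil {
		panic(err)
	}
}
```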
diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go index b746d13e1c..f317535592 100644 --- a/go/pserver/service_test.go +++ b/go/pserver/service_test.go @@ -10,12 +10,15 @@ import ( ) func TestFull(t *testing.T) { - s := pserver.NewService() + s, err := pserver.NewService("", time.Second*5) + if err != nil { + t.Error(err) + } var p pserver.Parameter p.Name = "param_a" p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0} p.ElementType = pserver.Int32 - err := s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil) + err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil) if err != nil { t.FailNow() } @@ -72,8 +75,11 @@ func TestFull(t *testing.T) { } func TestMultipleInit(t *testing.T) { - s := pserver.NewService() - err := s.FinishInitParams(0, nil) + s, err := pserver.NewService("", time.Second*5) + if err != nil { + t.Error(err) + } + err = s.FinishInitParams(0, nil) if err != nil { t.FailNow() } @@ -85,15 +91,18 @@ func TestMultipleInit(t *testing.T) { } func TestUninitialized(t *testing.T) { - s := pserver.NewService() - err := s.SendGrad(pserver.Gradient{}, nil) + s, err := pserver.NewService("", time.Second*5) + err = s.SendGrad(pserver.Gradient{}, nil) if err.Error() != pserver.Uninitialized { t.FailNow() } } func TestBlockUntilInitialized(t *testing.T) { - s := pserver.NewService() + s, err := pserver.NewService("", time.Second*5) + if err != nil { + t.Error(err) + } ch := make(chan struct{}, 2) errCh := make(chan error, 2) var wg sync.WaitGroup @@ -133,7 +142,7 @@ func TestBlockUntilInitialized(t *testing.T) { p.Name = "param_a" p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0} p.ElementType = pserver.Int32 - err := s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil) + err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil) if err != nil { t.FailNow() } diff --git a/go/utils/helper.go b/go/utils/helper.go new file mode 100644 index 0000000000..3220fd6c78 --- /dev/null +++ b/go/utils/helper.go @@ -0,0 +1,45 @@ +package utils + +import ( + "errors" + "net" +) + +// GetExternalIP returns the ip address of local network interface, not the +// loopback device. 
+func GetExternalIP() (string, error) { + ifaces, err := net.Interfaces() + if err != nil { + return "", err + } + for _, iface := range ifaces { + if iface.Flags&net.FlagUp == 0 { + continue // interface down + } + if iface.Flags&net.FlagLoopback != 0 { + continue // loopback interface + } + addrs, err := iface.Addrs() + if err != nil { + return "", err + } + for _, addr := range addrs { + var ip net.IP + switch v := addr.(type) { + case *net.IPNet: + ip = v.IP + case *net.IPAddr: + ip = v.IP + } + if ip == nil || ip.IsLoopback() { + continue + } + ip = ip.To4() + if ip == nil { + continue // not an ipv4 address + } + return ip.String(), nil + } + } + return "", errors.New("are you connected to the network?") +} diff --git a/go/utils/helper_test.go b/go/utils/helper_test.go new file mode 100644 index 0000000000..aa7c509768 --- /dev/null +++ b/go/utils/helper_test.go @@ -0,0 +1,10 @@ +package utils + +import "testing" + +func TestGetIP(t *testing.T) { + _, err := GetExternalIP() + if err != nil { + t.Errorf("GetExternalIP returns error : %v\n", err) + } +} From b7a52bd9767de41d65382929b1629e95e35a3fe5 Mon Sep 17 00:00:00 2001 From: wuyi05 Date: Wed, 21 Jun 2017 15:25:02 +0800 Subject: [PATCH 20/69] add started info log --- go/cmd/pserver/pserver.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index ddf5ad40fd..f42c90c6c6 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -43,7 +43,9 @@ func main() { panic(err) } + log.Infof("start pserver at port %d", *port) err = http.Serve(l, nil) + if err != nil { panic(err) } From aaf11fa6259dc0c4cc248a102141b68d94685ad7 Mon Sep 17 00:00:00 2001 From: zlx Date: Wed, 21 Jun 2017 15:44:07 +0800 Subject: [PATCH 21/69] modify the format --- paddle/parameter/ParameterUpdaterHook.cpp | 46 +++++++++++------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp index 66e554a70d..ba2cb37fa2 100644 --- a/paddle/parameter/ParameterUpdaterHook.cpp +++ b/paddle/parameter/ParameterUpdaterHook.cpp @@ -38,29 +38,28 @@ namespace paddle { class StaticPruningHook : public IParameterUpdaterHook { public: - explicit StaticPruningHook(const ParameterUpdaterHookConfig& hookConfig) + explicit StaticPruningHook(const ParameterUpdaterHookConfig &hookConfig) : initCount_(0) { sparsityRatio_ = hookConfig.sparsity_ratio(); } - static bool sortPairAscend(const std::pair& pair1, - const std::pair& pair2) { + static bool sortPairAscend(const std::pair &pair1, + const std::pair &pair2) { return pair1.first > pair2.first; } - void update(Parameter* para) { + void update(Parameter *para) { updateThreadChecker_.check(); - auto& vec = para->getBuf(PARAMETER_GRADIENT); + auto &vec = para->getBuf(PARAMETER_GRADIENT); if (vec) { vec->dotMul(*maskVec_); } } - void generateMask(Parameter* para) { - + void generateMask(Parameter *para) { VectorPtr maskTemp = Vector::create(para->getSize(), false); maskTemp->zeroMem(); - real* maskTempData = maskTemp->getData(); + real *maskTempData = maskTemp->getData(); size_t nonZeroNum = para->getSize() * (1 - sparsityRatio_); VectorPtr paraVec = para->getBuf(PARAMETER_VALUE); @@ -72,9 +71,10 @@ public: for (size_t i = 0; i < para->getSize(); i++) param.push_back(std::make_pair(fabs(paraCpuCopy->getData()[i]), i)); - std::partial_sort( - param.begin(), param.begin() + nonZeroNum, param.end(), sortPairAscend); - for (size_t i = 0; i < nonZeroNum; i++) 
maskTempData[param[i].second] = 1.0; + std::partial_sort(param.begin(), param.begin() + nonZeroNum, param.end(), + sortPairAscend); + for (size_t i = 0; i < nonZeroNum; i++) + maskTempData[param[i].second] = 1.0; // Currently just use a mask vector for hack. if (para->useGpu()) { @@ -85,7 +85,7 @@ public: } } - void init(Parameter* para) { + void init(Parameter *para) { generateMask(para); size_t initCount = this->initCount_.fetch_add(1); CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must invoke " @@ -93,7 +93,7 @@ public: VLOG(3) << "Initialize Parameter " << para; SetDevice device(para->getDeviceId()); - auto& paraVec = para->getBuf(PARAMETER_VALUE); + auto ¶Vec = para->getBuf(PARAMETER_VALUE); paraVec->dotMul(*maskVec_); } @@ -118,7 +118,7 @@ IParameterUpdaterHook::~IParameterUpdaterHook() {} */ class StringIntPairHasher { public: - size_t operator()(const std::pair& k) const { + size_t operator()(const std::pair &k) const { return intHasher_(strHasher_(k.first) + k.second); } @@ -127,17 +127,15 @@ private: std::hash intHasher_; }; -static WeakKVCache, - IParameterUpdaterHook, - StringIntPairHasher> - g_hookCache_; +static WeakKVCache, IParameterUpdaterHook, + StringIntPairHasher> g_hookCache_; /** * ParameterUpdaterHook actually factory method. */ -static IParameterUpdaterHook* createImpl( - const ParameterUpdaterHookConfig& config) { - auto& type = config.type(); +static IParameterUpdaterHook * +createImpl(const ParameterUpdaterHookConfig &config) { + auto &type = config.type(); if (type == "pruning") { return new StaticPruningHook(config); } @@ -146,11 +144,11 @@ static IParameterUpdaterHook* createImpl( return nullptr; } -std::shared_ptr IParameterUpdaterHook::create( - const ParameterConfig& paramConfig, int idx) { +std::shared_ptr +IParameterUpdaterHook::create(const ParameterConfig ¶mConfig, int idx) { std::pair key = {paramConfig.name(), idx}; return g_hookCache_.get( key, [&] { return createImpl(paramConfig.update_hooks(idx)); }); } -} // namespace paddle +} // namespace paddle From a266292a57613d16806cccd939d68f436731927c Mon Sep 17 00:00:00 2001 From: zlx Date: Wed, 21 Jun 2017 18:37:37 +0800 Subject: [PATCH 22/69] modify format --- paddle/parameter/ParameterUpdaterHook.cpp | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp index ba2cb37fa2..968803fc0f 100644 --- a/paddle/parameter/ParameterUpdaterHook.cpp +++ b/paddle/parameter/ParameterUpdaterHook.cpp @@ -71,10 +71,9 @@ public: for (size_t i = 0; i < para->getSize(); i++) param.push_back(std::make_pair(fabs(paraCpuCopy->getData()[i]), i)); - std::partial_sort(param.begin(), param.begin() + nonZeroNum, param.end(), - sortPairAscend); - for (size_t i = 0; i < nonZeroNum; i++) - maskTempData[param[i].second] = 1.0; + std::partial_sort( + param.begin(), param.begin() + nonZeroNum, param.end(), sortPairAscend); + for (size_t i = 0; i < nonZeroNum; i++) maskTempData[param[i].second] = 1.0; // Currently just use a mask vector for hack. if (para->useGpu()) { @@ -127,14 +126,16 @@ private: std::hash intHasher_; }; -static WeakKVCache, IParameterUpdaterHook, - StringIntPairHasher> g_hookCache_; +static WeakKVCache, + IParameterUpdaterHook, + StringIntPairHasher> + g_hookCache_; /** * ParameterUpdaterHook actually factory method. 
*/ -static IParameterUpdaterHook * -createImpl(const ParameterUpdaterHookConfig &config) { +static IParameterUpdaterHook *createImpl( + const ParameterUpdaterHookConfig &config) { auto &type = config.type(); if (type == "pruning") { return new StaticPruningHook(config); @@ -144,11 +145,11 @@ createImpl(const ParameterUpdaterHookConfig &config) { return nullptr; } -std::shared_ptr -IParameterUpdaterHook::create(const ParameterConfig ¶mConfig, int idx) { +std::shared_ptr IParameterUpdaterHook::create( + const ParameterConfig ¶mConfig, int idx) { std::pair key = {paramConfig.name(), idx}; return g_hookCache_.get( key, [&] { return createImpl(paramConfig.update_hooks(idx)); }); } -} // namespace paddle +} // namespace paddle From 09cc4408e5d5424fba49fcacbd813a846413f9cf Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Tue, 20 Jun 2017 16:03:49 +0800 Subject: [PATCH 23/69] modified xmap reader to process sample by order --- python/paddle/v2/reader/decorator.py | 36 ++++++++++++++++--- .../paddle/v2/reader/tests/decorator_test.py | 18 ++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index c76faa596c..68ffbd6f3d 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -230,7 +230,7 @@ class XmapEndSignal(): pass -def xmap_readers(mapper, reader, process_num, buffer_size): +def xmap_readers(mapper, reader, process_num, buffer_size, order=False): """ Use multiprocess to map samples from reader by a mapper defined by user. And this function contains a buffered decorator. @@ -242,21 +242,32 @@ def xmap_readers(mapper, reader, process_num, buffer_size): :type process_num: int :param buffer_size: max buffer size :type buffer_size: int + :param order: keep the order of reader + :type order: bool :return: the decarated reader :rtype: callable """ end = XmapEndSignal() in_queue = Queue(buffer_size) out_queue = Queue(buffer_size) - + out_order = [0] # define a worker to read samples from reader to in_queue def read_worker(reader, in_queue): for i in reader(): in_queue.put(i) in_queue.put(end) + + # define a worker to read samples from reader to in_queue with order flag + def order_read_worker(reader, in_queue): + in_order = 0 + for i in reader(): + in_queue.put((in_order,i)) + in_order+=1 + in_queue.put(end) # start a read worker in a thread - t = Thread(target=read_worker, args=(reader, in_queue)) + target = order_read_worker if order else read_worker + t = Thread(target=target, args=(reader, in_queue)) t.daemon = True t.start() @@ -270,12 +281,29 @@ def xmap_readers(mapper, reader, process_num, buffer_size): sample = in_queue.get() in_queue.put(end) out_queue.put(end) + + # define a worker to handle samples from in_queue by mapper + # and put mapped samples into out_queue by order + def order_handle_worker(in_queue, out_queue, mapper, out_order): + ins = in_queue.get() + while not isinstance(ins, XmapEndSignal): + order, sample = ins + r = mapper(sample) + while order != out_order[0]: + pass + out_queue.put(r) + out_order[0] += 1 + ins = in_queue.get() + in_queue.put(end) + out_queue.put(end) # start several handle_workers + target = order_handle_worker if order else handle_worker + args = (in_queue, out_queue, mapper, out_order) if order else (in_queue, out_queue, mapper) workers = [] for i in xrange(process_num): worker = Thread( - target=handle_worker, args=(in_queue, out_queue, mapper)) + target=target, args=args) worker.daemon = True workers.append(worker) for 
w in workers: diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py index 734154b979..76db91a44b 100644 --- a/python/paddle/v2/reader/tests/decorator_test.py +++ b/python/paddle/v2/reader/tests/decorator_test.py @@ -120,6 +120,24 @@ class TestShuffle(unittest.TestCase): total += 1 self.assertEqual(total, 10) +class TestXmap(unittest.TestCase): + def test_xmap(self): + def mapper(x): + return (x + 1) + orders = (True, False) + thread_nums = (1, 2, 4, 8, 16) + buffered_size = (1, 2, 4, 8, 16) + for order in orders: + for tNum in thread_nums: + for size in buffered_size: + result = [] + for i in paddle.v2.reader.xmap_readers(mapper, reader_creator_10(), tNum, size, order)(): + result.append(i) + if not order: + result.sort() + for idx, e in enumerate(result): + self.assertEqual(e, mapper(idx)) + if __name__ == '__main__': unittest.main() From cadea35a107167edee23b8e3ca0a92ca3d85e859 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Tue, 20 Jun 2017 16:11:14 +0800 Subject: [PATCH 24/69] format code --- python/paddle/v2/reader/decorator.py | 17 +++++++++-------- python/paddle/v2/reader/tests/decorator_test.py | 8 ++++++-- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index 68ffbd6f3d..e432003129 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -251,18 +251,19 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): in_queue = Queue(buffer_size) out_queue = Queue(buffer_size) out_order = [0] + # define a worker to read samples from reader to in_queue def read_worker(reader, in_queue): for i in reader(): in_queue.put(i) in_queue.put(end) - + # define a worker to read samples from reader to in_queue with order flag def order_read_worker(reader, in_queue): in_order = 0 for i in reader(): - in_queue.put((in_order,i)) - in_order+=1 + in_queue.put((in_order, i)) + in_order += 1 in_queue.put(end) # start a read worker in a thread @@ -281,7 +282,7 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): sample = in_queue.get() in_queue.put(end) out_queue.put(end) - + # define a worker to handle samples from in_queue by mapper # and put mapped samples into out_queue by order def order_handle_worker(in_queue, out_queue, mapper, out_order): @@ -292,18 +293,18 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): while order != out_order[0]: pass out_queue.put(r) - out_order[0] += 1 + out_order[0] += 1 ins = in_queue.get() in_queue.put(end) out_queue.put(end) # start several handle_workers target = order_handle_worker if order else handle_worker - args = (in_queue, out_queue, mapper, out_order) if order else (in_queue, out_queue, mapper) + args = (in_queue, out_queue, mapper, out_order) if order else ( + in_queue, out_queue, mapper) workers = [] for i in xrange(process_num): - worker = Thread( - target=target, args=args) + worker = Thread(target=target, args=args) worker.daemon = True workers.append(worker) for w in workers: diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py index 76db91a44b..0bd7733955 100644 --- a/python/paddle/v2/reader/tests/decorator_test.py +++ b/python/paddle/v2/reader/tests/decorator_test.py @@ -120,10 +120,12 @@ class TestShuffle(unittest.TestCase): total += 1 self.assertEqual(total, 10) + class TestXmap(unittest.TestCase): def test_xmap(self): def mapper(x): return (x 
+ 1)
+
         orders = (True, False)
         thread_nums = (1, 2, 4, 8, 16)
         buffered_size = (1, 2, 4, 8, 16)
@@ -131,13 +133,15 @@ class TestXmap(unittest.TestCase):
             for tNum in thread_nums:
                 for size in buffered_size:
                     result = []
-                    for i in paddle.v2.reader.xmap_readers(mapper, reader_creator_10(), tNum, size, order)():
+                    for i in paddle.v2.reader.xmap_readers(mapper,
+                                                           reader_creator_10(),
+                                                           tNum, size, order)():
                         result.append(i)
                     if not order:
                         result.sort()
                     for idx, e in enumerate(result):
                         self.assertEqual(e, mapper(idx))
-
+
 if __name__ == '__main__':
     unittest.main()

From d322c94243ef2039c633c3e455a6d3660193804c Mon Sep 17 00:00:00 2001
From: wanghaoshuang
Date: Wed, 21 Jun 2017 21:41:53 +0800
Subject: [PATCH 25/69] fix unittest

---
 python/paddle/v2/reader/tests/decorator_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py
index 0bd7733955..bb3c5d220b 100644
--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -134,7 +134,7 @@ class TestXmap(unittest.TestCase):
                 for size in buffered_size:
                     result = []
                     for i in paddle.v2.reader.xmap_readers(mapper,
-                                                           reader_creator_10(),
+                                                           reader_creator_10(0),
                                                            tNum, size, order)():
                         result.append(i)
                     if not order:

From 7cb68a8d9315bd3c3c769e47ee3752867854ee12 Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Wed, 21 Jun 2017 13:19:40 -0700
Subject: [PATCH 26/69] Add paddle/memory/README.md

---
 paddle/README.md | 141 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100644 paddle/README.md

diff --git a/paddle/README.md b/paddle/README.md
new file mode 100644
index 0000000000..24af37987e
--- /dev/null
+++ b/paddle/README.md
@@ -0,0 +1,141 @@
+In my mind, the memory package works like the following:

## Design

### Usage

To allocate 4KB CPU memory:

```cpp
p = memory::Alloc(platform::CPUPlace(), 4*1024);
```

To allocate 4KB memory on the 3rd GPU:

```cpp
p = memory::Alloc(platform::GPUPlace(2), 4*1024);
```

To free memory and check the so-far used amount of memory on a place:

```cpp
auto pl = platform::GPUPlace(0);
p = memory::Alloc(pl, 4*1024);
cout << memory::Used(pl);
memory::Free(pl, p);
```

### The API

In `paddle/memory/memory.h` we have:

```cpp
namespace memory {
template <typename Place> void* Alloc(Place, size_t);
template <typename Place> void Free(Place, void*);
}
```

These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:

```cpp
template<>
void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
  return GetCPUBuddyAllocator()->Alloc(size);
}
```

and

```cpp
template<>
void* Alloc<GPUPlace>(GPUPlace p, size_t size) {
  return GetGPUBuddyAllocator(p.id)->Alloc(size);
}
```

### The Implementation

`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletons.

```cpp
BuddyAllocator* GetCPUBuddyAllocator() {
  static BuddyAllocator* a = NULL;
  if (a == NULL) {
    a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
  }
  return a;
}

BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
  static BuddyAllocator** as = NULL;
  if (as == NULL) {
    as = new BuddyAllocator*[platform::NumGPUs()];
    for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
      as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...);
    }
  }
  return as[gpu_id];
}
```

#### `BuddyAllocator`

`BuddyAllocator` implements the buddy allocation algorithm.
Its constructor takes parameters only related to the algorithm:

```cpp
BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) {
  ...
}
```

Please be aware that **`BuddyAllocator` always allocates aligned memory**, aligned on 32 bytes, which can hold a `BuddyAllocator::Block` object:

```cpp
class BuddyAllocator {
 private:
  struct Block {
    size_t size;
    Block* left, *right;
  };
  ...
};
```

#### System Allocators

The `GPUAllocator` and `CPUAllocator` are called *system allocators*. They hold information about the device, including the amount of memory that has been allocated. This allows us to call

- `GPUAllocator::Used` and
- `CPUAllocator::Used`

to get the amount of memory that has been allocated so far.


## Why Such a Design

I got inspiration from Majel and Caffe2, though the design above looks different from both.

### Caffe2

In Caffe2, `Tensor::mutable_data()` allocates the memory. In particular, [`Tensor::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479).

There are two implementations of `Context`:

1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.

1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, which also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.

### Majel

In Majel, there are basically two allocator types:

1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`.
1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`.

However, memory allocation is not done via these two allocators. Instead, these two allocators are defined in hidden namespaces.

In Majel there are hidden global variables like:

1. `cpu::SystemAllocator g_cpu_allocator`, and
1. `vector<gpu::SystemAllocator> g_gpu_allocators(NUM_GPUS)`.

Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if the BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`.

From c9a76ebba2c0b050c157232c13670b17d2ba806d Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Wed, 21 Jun 2017 17:02:30 -0700
Subject: [PATCH 27/69] Have to auto-format networks.py because CI complains
 about it.
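(For context on the change below: the reformatting is purely cosmetic. Python's argument unpacking is insensitive to whitespace after the `*`, as this tiny standalone check (illustrative only, not part of the patch) shows.)

```python
def inputs(*names):
    # Collect the unpacked positional arguments.
    return list(names)


layers = ["fc1", "fc2"]
# Both calls unpack the same list; only the formatting style differs.
assert inputs(*[l for l in layers]) == inputs(* [l for l in layers])
```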
--- python/paddle/trainer_config_helpers/networks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 1bf59ed484..67154a8d7d 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1381,7 +1381,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(*[l.name for l in layers]) + Inputs(* [l.name for l in layers]) def outputs(layers, *args): @@ -1424,7 +1424,7 @@ def outputs(layers, *args): assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(*[l.name for l in layers]) + Outputs(* [l.name for l in layers]) return # just return outputs. if len(layers) != 1: From 106dd4bed2de2b3e71700de4487e9f4ca009df8e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 22 Jun 2017 12:37:51 +0800 Subject: [PATCH 28/69] Using previous image for travis-ci Because travis-ci has been updated Ubuntu Trusty image, it causes Paddle CI building error. Just using old image now for hot-fix, I will add another issue to fix Paddle building in new TravisCI image. Related link https://blog.travis-ci.com/2017-06-21-trusty-updates-2017-Q2-launch --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index f9b4a7e083..87cef10b2b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,4 @@ +group: deprecated-2017Q2 language: cpp cache: directories: From 0b936e9399f2a5f01f6fde1d1b78b56306a8f9ac Mon Sep 17 00:00:00 2001 From: wuyi05 Date: Thu, 22 Jun 2017 15:00:39 +0800 Subject: [PATCH 29/69] update pserver etcd --- go/cmd/pserver/pserver.go | 3 +- go/pserver/service.go | 75 ++++++++++++--------- go/utils/{ => networkhelper}/helper.go | 2 +- go/utils/{ => networkhelper}/helper_test.go | 2 +- 4 files changed, 47 insertions(+), 35 deletions(-) rename go/utils/{ => networkhelper}/helper.go (97%) rename go/utils/{ => networkhelper}/helper_test.go (87%) diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index f42c90c6c6..fe1fe5f6f0 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -18,7 +18,8 @@ func main() { etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379", "comma separated endpoint string for pserver to connect to etcd") etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls") - logLevel := flag.String("log-level", "info", "log level, one of debug") + logLevel := flag.String("log-level", "info", + "log level, possible values: debug, info, warning, error, fatal, panic") flag.Parse() level, err := log.ParseLevel(*logLevel) diff --git a/go/pserver/service.go b/go/pserver/service.go index a5c76857ab..7400b48832 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -9,7 +9,7 @@ import ( "sync" "time" - "github.com/PaddlePaddle/Paddle/go/utils" + "github.com/PaddlePaddle/Paddle/go/utils/networkhelper" "github.com/coreos/etcd/clientv3" "github.com/coreos/etcd/clientv3/concurrency" log "github.com/sirupsen/logrus" @@ -33,6 +33,9 @@ const ( Float64 ) +// PsDesired is etcd path for store desired pserver count +const PsDesired = "/ps_desired" + // Parameter is a piece of data to sync with the parameter server. type Parameter struct { Name string @@ -68,7 +71,8 @@ type Service struct { externalIP string } -// NewService creates a new service. +// NewService creates a new service, will bypass etcd registration if no +// endpoints specified. 
func NewService(endpoints string, timeout time.Duration) (*Service, error) { s := &Service{opt: newOptimizer(sgd, 0.005)} s.paramMap = make(map[string]Parameter) @@ -77,7 +81,7 @@ func NewService(endpoints string, timeout time.Duration) (*Service, error) { s.etcdTimeout = timeout var err error - s.externalIP, err = utils.GetExternalIP() + s.externalIP, err = networkhelper.GetExternalIP() if err != nil { return nil, err } @@ -102,67 +106,74 @@ func NewService(endpoints string, timeout time.Duration) (*Service, error) { // wait and set s.desired init value for { ctx, cancel := context.WithTimeout(context.Background(), time.Second) - resp, err := s.etcdClient.Get(ctx, "/ps_desired") + resp, err := s.etcdClient.Get(ctx, PsDesired) cancel() if err != nil { - log.Errorf("getting /ps_desired error: %v", err) + log.Errorf("getting %s error: %v", PsDesired, err) time.Sleep(s.etcdTimeout) continue } - for _, ev := range resp.Kvs { - log.Debugf("key: %s, value: %s", ev.Key, ev.Value) - if string(ev.Key) == "/ps_desired" { - s.desired, err = strconv.Atoi(string(ev.Value)) - if err != nil { - log.Errorf("value of /ps_desired invalid %v\n", err) - time.Sleep(s.etcdTimeout) - // NOTE: wait util ps_desired value change - continue - } + if len(resp.Kvs) != 0 { + s.desired, err = strconv.Atoi(string(resp.Kvs[0].Value)) + if err != nil { + log.Errorf("value of %s invalid %v\n", PsDesired, err) + time.Sleep(s.etcdTimeout) + // NOTE: wait util ps_desired value change + continue } + break + } + } + // try register pserver node on etcd + for { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + _, err := s.registerPserverEtcd(ctx) + cancel() + if err != nil { + log.Warn(err) + time.Sleep(s.etcdTimeout) + continue } break } - s.registerPserverEtcd() } // if endpoints != "" // Bypass etcd registration if no endpoints specified return s, nil } // registerPserverEtcd registers pserver node on etcd using transaction. -func (s *Service) registerPserverEtcd() (*clientv3.TxnResponse, error) { - return concurrency.NewSTMRepeatable(context.TODO(), s.etcdClient, func(c concurrency.STM) error { +func (s *Service) registerPserverEtcd(ctx context.Context) (*clientv3.TxnResponse, error) { + return concurrency.NewSTM(s.etcdClient, func(c concurrency.STM) error { + registered := false for i := 0; i < s.desired; i++ { psKey := "/ps/" + strconv.Itoa(i) log.Debugf("checking %s", psKey) ps := c.Get(psKey) log.Debugf("got value (%s) for key: %s", ps, psKey) - resp, err := s.etcdClient.Grant(context.TODO(), 5) - if err != nil { - log.Fatal(err) - } - if ps == "" { + resp, err := s.etcdClient.Grant(context.TODO(), 5) + if err != nil { + log.Fatal(err) + } // find the first id and write info c.Put(psKey, s.externalIP, clientv3.WithLease(resp.ID)) log.Debugf("set pserver node %s with value %s", psKey, s.externalIP) - ch, kaerr := s.etcdClient.KeepAlive(context.TODO(), resp.ID) + _, kaerr := s.etcdClient.KeepAlive(context.TODO(), resp.ID) if kaerr != nil { log.Errorf("keepalive etcd node error: %v", kaerr) return kaerr } - // FIXME: does this really needed? 
- go func(ch <-chan *clientv3.LeaseKeepAliveResponse) { - ka := <-ch - log.Debugf("keepalive: %d\n", ka.TTL) - }(ch) + log.Debug("register finished") + registered = true break } } - log.Debug("register finished") - return nil - }) + if registered == true { + return nil + } + return errors.New("not registerd, may due to already have enough pservers") + }, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads)) } // InitParam initializes a parameter. diff --git a/go/utils/helper.go b/go/utils/networkhelper/helper.go similarity index 97% rename from go/utils/helper.go rename to go/utils/networkhelper/helper.go index 3220fd6c78..fbeaea8f5e 100644 --- a/go/utils/helper.go +++ b/go/utils/networkhelper/helper.go @@ -1,4 +1,4 @@ -package utils +package networkhelper import ( "errors" diff --git a/go/utils/helper_test.go b/go/utils/networkhelper/helper_test.go similarity index 87% rename from go/utils/helper_test.go rename to go/utils/networkhelper/helper_test.go index aa7c509768..4208f9e358 100644 --- a/go/utils/helper_test.go +++ b/go/utils/networkhelper/helper_test.go @@ -1,4 +1,4 @@ -package utils +package networkhelper import "testing" From d3e2db4b4f3efa537a2b85bb88d8d8f3e780f09c Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 22 Jun 2017 08:12:10 -0700 Subject: [PATCH 30/69] Revert changes made by misleading errors from Travis CI --- python/paddle/trainer_config_helpers/networks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 67154a8d7d..1bf59ed484 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1381,7 +1381,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(* [l.name for l in layers]) + Inputs(*[l.name for l in layers]) def outputs(layers, *args): @@ -1424,7 +1424,7 @@ def outputs(layers, *args): assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(* [l.name for l in layers]) + Outputs(*[l.name for l in layers]) return # just return outputs. 
 if len(layers) != 1:

From 8cfa48dc88c0c702b30094ca558bf2182e00faba Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Thu, 22 Jun 2017 10:27:36 -0700
Subject: [PATCH 31/69] Move README.md from paddle/ to paddle/memory/

---
 paddle/{ => memory}/README.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename paddle/{ => memory}/README.md (100%)

diff --git a/paddle/README.md b/paddle/memory/README.md
similarity index 100%
rename from paddle/README.md
rename to paddle/memory/README.md

From c617520776c58791d77d1382eba67ac4264916f0 Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Thu, 22 Jun 2017 10:35:52 -0700
Subject: [PATCH 32/69] In response to comments from Liao Gang and Yu Yang

---
 paddle/memory/README.md | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/paddle/memory/README.md b/paddle/memory/README.md
index 24af37987e..b71ca29696 100644
--- a/paddle/memory/README.md
+++ b/paddle/memory/README.md
@@ -25,14 +25,16 @@ cout << memory::Used(pl);
 memory::Free(pl, p);
 ```

-### The API
+### API

 In `paddle/memory/memory.h` we have:

 ```cpp
-template void* Alloc(Place, size_t);
-template void Free(Place, void*);
-}
+namespace memory {
+template <typename Place> void* Alloc(Place, size_t);
+template <typename Place> void Free(Place, void*);
+template <typename Place> void Used(Place);
+}  // namespace memory
 ```

 These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
@@ -48,12 +50,14 @@ and

 ```cpp
 template<>
-void Alloc(GPUPlace)(GPUPlace p, size_t size) {
+void Alloc(GPUPlace p, size_t size) {
   return GetGPUBuddyAllocator(p.id)->Alloc(size);
 }
 ```

-### The Implementation
+Similar specializations exist for `Free` and `Used`.
+
+### Implementation

 `GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletions.

@@ -94,7 +98,7 @@ class BuddyAllocator {
  private:
   struct Block {
     size_t size;
-    Blobk* left, right;
+    Block* left, right;
   };
   ...
 };
@@ -102,15 +106,15 @@ class BuddyAllocator {

 #### System Allocators

-The `GPUAllocator` and `CPUAllocator` are calls *system allocators*. They hold information about the device, including the amount of memory has been allocated. So that we can call
+The `GPUAllocator` and `CPUAllocator` are called *system allocators*. They work as the fallback allocators of `BuddyAllocator`. A system allocator holds information about a device, including the amount of memory that has been allocated, so we can call

-- `GPUAllocator::Used` and
-- `CPUAllocator::Used`
+- `GPUAllocator::Used()` and
+- `CPUAllocator::Used()`

 to get the amount of memory that has been allocated so far.


-## Why Such a Design
+## Justification

 I got inspiration from Majel and Caffe2, though the above design looks different from both.

 ### Caffe2

From e558ed0d5eade5d2bf6a1bb37beeb39486e9dd76 Mon Sep 17 00:00:00 2001
From: Helin Wang
Date: Fri, 23 Jun 2017 01:48:04 +0000
Subject: [PATCH 33/69] fix etcd lease

I made a comment in WuYi's PR that this is not necessary, so WuYi
removed it. Turns out it's necessary after confirming with a coreOS
developer.
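
For the record, a hedged sketch of the keep-alive pattern (a draining
loop; the actual change below reads a single response from the
channel):

    go func(ch <-chan *clientv3.LeaseKeepAliveResponse) {
        // Keep consuming responses; otherwise the lease may be
        // dropped and etcd will expire the pserver's registry key.
        for ka := range ch {
            log.Debugf("keepalive: %d\n", ka.TTL)
        }
    }(ch)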
--- go/pserver/service.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/go/pserver/service.go b/go/pserver/service.go index 7400b48832..7e2b841dd8 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -159,11 +159,18 @@ func (s *Service) registerPserverEtcd(ctx context.Context) (*clientv3.TxnRespons // find the first id and write info c.Put(psKey, s.externalIP, clientv3.WithLease(resp.ID)) log.Debugf("set pserver node %s with value %s", psKey, s.externalIP) - _, kaerr := s.etcdClient.KeepAlive(context.TODO(), resp.ID) + ch, kaerr := s.etcdClient.KeepAlive(context.TODO(), resp.ID) if kaerr != nil { log.Errorf("keepalive etcd node error: %v", kaerr) return kaerr } + + // Eat the keep alive message so etcd + // will not expire the lease. + go func(ch <-chan *clientv3.LeaseKeepAliveResponse) { + ka := <-ch + log.Debugf("keepalive: %d\n", ka.TTL) + }(ch) log.Debug("register finished") registered = true break From c2fc896f5b2896fc6509e720e7dc08527495927f Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 22 Jun 2017 19:05:28 -0700 Subject: [PATCH 34/69] Simplify Travis CI configuration --- .travis.yml | 2 -- paddle/scripts/travis/build_and_test.sh | 12 ------------ paddle/scripts/travis/{docs.sh => build_doc.sh} | 13 ++++++++----- .../scripts/travis/{precommit.sh => check_style.sh} | 8 ++++---- paddle/scripts/travis/main.sh | 12 +++++------- 5 files changed, 17 insertions(+), 30 deletions(-) delete mode 100755 paddle/scripts/travis/build_and_test.sh rename paddle/scripts/travis/{docs.sh => build_doc.sh} (84%) rename paddle/scripts/travis/{precommit.sh => check_style.sh} (54%) diff --git a/.travis.yml b/.travis.yml index 87cef10b2b..915c23b7ab 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,6 @@ group: deprecated-2017Q2 language: cpp cache: directories: - - $HOME/third_party - $HOME/.ccache - $HOME/.cache/pip sudo: required @@ -18,7 +17,6 @@ addons: packages: - gcc-4.8 - g++-4.8 - - gfortran-4.8 - git - build-essential - python diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh deleted file mode 100755 index f2cbc56165..0000000000 --- a/paddle/scripts/travis/build_and_test.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -source ./common.sh - -NPROC=1 -export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages -export PYTHONHOME=/opt/python/2.7.12 -export PATH=/opt/python/2.7.12/bin:${PATH} -cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DWITH_COVERAGE=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} -NRPOC=`nproc` -make -j $NPROC -make coveralls -sudo make install diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/build_doc.sh similarity index 84% rename from paddle/scripts/travis/docs.sh rename to paddle/scripts/travis/build_doc.sh index c784293695..88264d8c26 100755 --- a/paddle/scripts/travis/docs.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -1,15 +1,18 @@ #!/bin/bash +set -e + +# Create the build directory for CMake. +mkdir -p $TRAVIS_BUILD_DIR/build +cd $TRAVIS_BUILD_DIR/build -# Add set -e, cd to directory. -source ./common.sh # Compile Documentation only. -cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF ${EXTRA_CMAKE_OPTS} +cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF mkdir output make -j `nproc` find .. -name '*whl' | xargs pip install # install all wheels. rm -rf * -cmake .. 
-DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS} -make paddle_docs paddle_docs_cn +cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON +make -j `nproc` paddle_docs paddle_docs_cn # check websites for broken links linkchecker doc/en/html/index.html diff --git a/paddle/scripts/travis/precommit.sh b/paddle/scripts/travis/check_style.sh similarity index 54% rename from paddle/scripts/travis/precommit.sh rename to paddle/scripts/travis/check_style.sh index 7a59b1131d..4754bdd4c8 100755 --- a/paddle/scripts/travis/precommit.sh +++ b/paddle/scripts/travis/check_style.sh @@ -1,14 +1,14 @@ #!/bin/bash function abort(){ - echo "Your commit not fit PaddlePaddle code style" 1>&2 - echo "Please use pre-commit scripts to auto-format your code" 1>&2 + echo "Your change doesn't follow PaddlePaddle's code style." 1>&2 + echo "Please use pre-commit to reformat your code and git push again." 1>&2 exit 1 } trap 'abort' 0 set -e -source common.sh -cd .. + +cd $TRAVIS_BUILD_DIR export PATH=/usr/bin:$PATH pre-commit install clang-format --version diff --git a/paddle/scripts/travis/main.sh b/paddle/scripts/travis/main.sh index 13f2552d29..30afe60f60 100755 --- a/paddle/scripts/travis/main.sh +++ b/paddle/scripts/travis/main.sh @@ -1,13 +1,11 @@ #!/bin/bash cd `dirname $0` -if [ ${JOB} == "BUILD_AND_TEST" ]; then - ./build_and_test.sh -elif [ ${JOB} == "DOCS" ]; then - ./docs.sh +if [ ${JOB} == "DOCS" ]; then + ./build_doc.sh elif [ ${JOB} == "PRE_COMMIT" ]; then - ./precommit.sh + ./check_style.sh else - echo Unknown job ${JOB} - exit 1 + echo "Unknown Travis CI job: ${JOB}" + exit 0 # Don't fail due to unknown Travis CI job. fi From 7cf640b58ddeb2cc91d027ade8a6f326d42b5a8d Mon Sep 17 00:00:00 2001 From: Peng Li Date: Fri, 23 Jun 2017 10:26:46 +0800 Subject: [PATCH 35/69] add coeff parameter to classification_cost --- python/paddle/trainer_config_helpers/layers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index b8ce0373c0..84ed160773 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -3839,7 +3839,8 @@ def classification_cost(input, weight=None, name=None, evaluator=classification_error_evaluator, - layer_attr=None): + layer_attr=None, + coeff=1.): """ classification cost Layer. @@ -3855,6 +3856,8 @@ def classification_cost(input, :param evaluator: Evaluator method. :param layer_attr: layer's extra attribute. :type layer_attr: ExtraLayerAttribute + :param coeff: The coefficient affects the gradient in the backward. + :type coeff: float :return: LayerOutput object. 
     :rtype: LayerOutput
     """
@@ -3868,6 +3871,7 @@ def classification_cost(input,
         name=name,
         type="multi-class-cross-entropy",
         inputs=ipts,
+        coeff=coeff,
         **ExtraLayerAttribute.to_kwargs(layer_attr))

     def __add_evaluator__(e):

From fba4649bcac265ce720fc8e71f0625f228ad2812 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Fri, 23 Jun 2017 10:31:21 +0800
Subject: [PATCH 36/69] Remove `BUILD_AND_TEST` section in .travis.yml

---
 .travis.yml | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 915c23b7ab..6b4fb4c4b6 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,7 +10,6 @@ os:
   - linux
 env:
   - JOB=DOCS
-  - JOB=BUILD_AND_TEST
   - JOB=PRE_COMMIT
 addons:
   apt:
@@ -33,17 +32,6 @@ addons:
     - libtool
     - ccache
 before_install:
-  - |
-    if [ ${JOB} == "BUILD_AND_TEST" ]; then
-      local change_list=`git diff --name-only $TRAVIS_COMMIT_RANGE`
-      if [ $? -eq 0 ]; then  # if git diff return no zero, then rerun unit test.
-        if ! echo ${change_list} | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
-        then
-          echo "Only markdown docs were updated, stopping build process."
-          exit
-        fi
-      fi
-    fi
   - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
   # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
   # protobuf version.

From 260416559264c7a8d4dc63cd79619752a862cdf4 Mon Sep 17 00:00:00 2001
From: dongzhihong
Date: Fri, 23 Jun 2017 10:37:19 +0800
Subject: [PATCH 37/69] "resolve clock skew"

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 39af60966b..bf227737c5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -25,7 +25,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
     apt-get install -y \
     git python-pip python-dev openssh-server bison \
-    wget unzip tar xz-utils bzip2 gzip coreutils \
+    wget unzip tar xz-utils bzip2 gzip coreutils ntp \
     curl sed grep graphviz libjpeg-dev zlib1g-dev \
     python-numpy python-matplotlib gcc g++ \
     automake locales clang-format-3.8 swig doxygen cmake \

From fdde4eff0da95a170f2f727a8345057f20be09ef Mon Sep 17 00:00:00 2001
From: zlx
Date: Fri, 23 Jun 2017 12:00:45 +0800
Subject: [PATCH 38/69] fix some typos

---
 paddle/parameter/ParameterUpdaterHook.cpp     | 2 +-
 python/paddle/trainer_config_helpers/attrs.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp
index 968803fc0f..c8b47687f5 100644
--- a/paddle/parameter/ParameterUpdaterHook.cpp
+++ b/paddle/parameter/ParameterUpdaterHook.cpp
@@ -32,7 +32,7 @@ namespace paddle {
 /**
  * The static pruning hook
  * Static means user specify a sparsity_ratio before training started, and the
- * network will prune the parameters based on the sparsity_ratio. More deatils
+ * network will prune the parameters based on the sparsity_ratio. More details
 * can be found https://arxiv.org/pdf/1506.02626.pdf. 
*/ diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index 66163bdc8d..c02306f394 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -84,7 +84,7 @@ class HookAttribute(object): assert is_compatible_with( self.sparsity_ratio, float), 'sparisity_ratio must be float type' - assert self.sparsity_ratio <= 1 and self.sparsity_ratio >= 0, 'sparisity_ratio must be a float between [0, 1] ' + assert self.sparsity_ratio <= 1 and self.sparsity_ratio >= 0, 'sparsity_ratio must be a float between [0, 1] ' def __call__(self): return ParameterHook(self.type, sparsity_ratio=self.sparsity_ratio) From 72c1a7fb5e2871ba3f6384ea28eaeed10aa5e76a Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 22 Jun 2017 21:06:07 -0700 Subject: [PATCH 39/69] Remove common.sh --- paddle/scripts/travis/common.sh | 6 ------ 1 file changed, 6 deletions(-) delete mode 100755 paddle/scripts/travis/common.sh diff --git a/paddle/scripts/travis/common.sh b/paddle/scripts/travis/common.sh deleted file mode 100755 index f05c7530a3..0000000000 --- a/paddle/scripts/travis/common.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -e -mkdir -p ../../../build -cd ../../../build -mkdir -p $HOME/third_party -EXTRA_CMAKE_OPTS="-DTHIRD_PARTY_PATH=${HOME}/third_party" From 0cbe120d8c06a1c064293918986264cb320bdb78 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 22 Jun 2017 21:16:07 -0700 Subject: [PATCH 40/69] Remove paddle/script/travis/main.sh --- .travis.yml | 10 ++++------ paddle/scripts/travis/main.sh | 11 ----------- 2 files changed, 4 insertions(+), 17 deletions(-) delete mode 100755 paddle/scripts/travis/main.sh diff --git a/.travis.yml b/.travis.yml index 6b4fb4c4b6..2c46da71e7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,8 +9,8 @@ dist: trusty os: - linux env: - - JOB=DOCS - - JOB=PRE_COMMIT + - JOB=build_doc + - JOB=check_style addons: apt: packages: @@ -32,7 +32,7 @@ addons: - libtool - ccache before_install: - - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi + - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker @@ -41,9 +41,7 @@ before_install: - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: - - | - timeout 2580 paddle/scripts/travis/main.sh # 43min timeout - RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; + - paddle/scripts/travis/$JOB.sh notifications: email: on_success: change diff --git a/paddle/scripts/travis/main.sh b/paddle/scripts/travis/main.sh deleted file mode 100755 index 30afe60f60..0000000000 --- a/paddle/scripts/travis/main.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -cd `dirname $0` - -if [ ${JOB} == "DOCS" ]; then - ./build_doc.sh -elif [ ${JOB} == "PRE_COMMIT" ]; then - ./check_style.sh -else - echo "Unknown Travis CI job: ${JOB}" - exit 0 # Don't fail due to unknown Travis CI job. 
-fi

From 1d6b8595490d0d679a18329eccfa53d8bb285b96 Mon Sep 17 00:00:00 2001
From: zlx
Date: Fri, 23 Jun 2017 13:11:31 +0800
Subject: [PATCH 41/69] fix typo

---
 python/paddle/trainer_config_helpers/attrs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index c02306f394..9b9f979bb6 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -68,7 +68,7 @@ class HookAttribute(object):
     The specific usage can be
         paddle.layer.img_conv(input=img, filter_size=3, num_channels=3,
                               num_filters=64,
                               param_attr=ParameterAttribute(update_hooks=hk) )
-    The pruning deatils can be found https://arxiv.org/pdf/1506.02626.pdf
+    The pruning details can be found https://arxiv.org/pdf/1506.02626.pdf

     :type type: string
     :param sparsity_ratio: Must be specified if hook type is 'pruning',

From c89fe83a775b0c8264f00de589d263fc6faec615 Mon Sep 17 00:00:00 2001
From: Liu Yiqun
Date: Fri, 23 Jun 2017 16:32:05 +0800
Subject: [PATCH 42/69] Fix the problem that protobuf cannot be used as a DEPS
 argument in cc_library.

---
 cmake/external/protobuf.cmake | 61 ++++++++++++++++++++++++++++-------
 1 file changed, 49 insertions(+), 12 deletions(-)

diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 7340394b1e..ce32b2531e 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -14,11 +14,41 @@

 INCLUDE(ExternalProject)

+# Print and set the protobuf library information,
+# finish this cmake process and exit from this file.
 macro(PROMPT_PROTOBUF_LIB)
+    SET(protobuf_DEPS ${ARGN})
+
     MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
     MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
     MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
    INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
+
+    # Assuming that all the protobuf libraries are of the same type. 
+ IF(${PROTOBUF_LIBRARY} MATCHES "${STATIC_LIBRARY_SUFFIX}$") + SET(protobuf_LIBTYPE STATIC) + ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${DYNAMIC_LIBRARY_SUFFIX}$") + SET(protobuf_LIBTYPE SHARED) + ELSE() + MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}") + ENDIF() + + ADD_LIBRARY(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY}) + + ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY}) + + ADD_LIBRARY(protoc ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY}) + + FOREACH(dep ${protobuf_DEPS}) + ADD_DEPENDENCIES(protobuf ${dep}) + ADD_DEPENDENCIES(protobuf_lite ${dep}) + ADD_DEPENDENCIES(protoc ${dep}) + ENDFOREACH() + + LIST(APPEND external_project_dependencies protobuf) RETURN() endmacro() macro(SET_PROTOBUF_VERSION) @@ -43,8 +73,9 @@ if (NOT "${PROTOBUF_ROOT}" STREQUAL "") endif() FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) - SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_NAME}) - SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_NAME}) + STRING(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}") + SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME}) + SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME}) SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) @@ -109,6 +140,8 @@ IF(NOT CMAKE_CROSSCOMPILING) SET_PROTOBUF_VERSION() IF("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0") SET(PROTOBUF_FOUND OFF) + ELSE() + PROMPT_PROTOBUF_LIB() ENDIF() ENDIF(PROTOBUF_FOUND) ELSE() @@ -120,18 +153,22 @@ ELSE() ENDIF() IF(NOT PROTOBUF_FOUND) - build_protobuf(protobuf FALSE) - LIST(APPEND external_project_dependencies protobuf) + build_protobuf(extern_protobuf FALSE) - SET(PROTOBUF_INCLUDE_DIR ${protobuf_INCLUDE_DIR} + SET(PROTOBUF_INCLUDE_DIR ${extern_protobuf_INCLUDE_DIR} CACHE PATH "protobuf include directory." FORCE) - IF(NOT CMAKE_CROSSCOMPILING) - SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_PROTOC_EXECUTABLE} + SET(PROTOBUF_LITE_LIBRARY ${extern_protobuf_LITE_LIBRARY} + CACHE FILEPATH "protobuf lite library." FORCE) + SET(PROTOBUF_LIBRARY ${extern_protobuf_LIBRARY} + CACHE FILEPATH "protobuf library." FORCE) + SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} + CACHE FILEPATH "protoc library." FORCE) + + IF(CMAKE_CROSSCOMPILING) + PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf) + ELSE() + SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} CACHE FILEPATH "protobuf executable." FORCE) + PROMPT_PROTOBUF_LIB(extern_protobuf) ENDIF() - SET(PROTOBUF_LITE_LIBRARY ${protobuf_LITE_LIBRARY} CACHE FILEPATH "protobuf lite library." FORCE) - SET(PROTOBUF_LIBRARY ${protobuf_LIBRARY} CACHE FILEPATH "protobuf library." FORCE) - SET(PROTOBUF_PROTOC_LIBRARY ${protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." FORCE) ENDIF(NOT PROTOBUF_FOUND) - -PROMPT_PROTOBUF_LIB() \ No newline at end of file From 16f8508d74bd7d40776ad442a927f89d17960d6b Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Fri, 23 Jun 2017 17:46:32 +0800 Subject: [PATCH 43/69] Use CMake system variables, such as CMAKE_STATIC_LIBRARY_PREFIX/SUFFIX, instead. 
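
For reference, a hedged sketch of what these variables expand to (the
platform values come from CMake itself, not from this change;
`PROTOBUF_INSTALL_DIR` is the variable the file below already uses):

    # Linux/macOS: "lib" + "protobuf" + ".a"   ->  libprotobuf.a
    # Windows:     ""    + "protobuf" + ".lib" ->  protobuf.lib
    SET(PROTOBUF_LIBRARY
        "${PROTOBUF_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}protobuf${CMAKE_STATIC_LIBRARY_SUFFIX}")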
--- cmake/external/openblas.cmake | 3 ++- cmake/external/protobuf.cmake | 12 ++++++------ cmake/system.cmake | 18 ------------------ 3 files changed, 8 insertions(+), 25 deletions(-) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 2341e3785b..5b9d9844ed 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -21,7 +21,8 @@ IF(NOT ${CBLAS_FOUND}) SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE) - SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${LIBRARY_PREFIX}openblas${STATIC_LIBRARY_SUFFIX}" + SET(CBLAS_LIBRARIES + "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "openblas library." FORCE) SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index ce32b2531e..d43badc1da 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -25,9 +25,9 @@ macro(PROMPT_PROTOBUF_LIB) INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) # Assuming that all the protobuf libraries are of the same type. - IF(${PROTOBUF_LIBRARY} MATCHES "${STATIC_LIBRARY_SUFFIX}$") + IF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$") SET(protobuf_LIBTYPE STATIC) - ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${DYNAMIC_LIBRARY_SUFFIX}$") + ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$") SET(protobuf_LIBTYPE SHARED) ELSE() MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}") @@ -80,16 +80,16 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) SET(${TARGET_NAME}_LITE_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${STATIC_LIBRARY_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}" PARENT_SCOPE) SET(${TARGET_NAME}_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${STATIC_LIBRARY_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}" PARENT_SCOPE) SET(${TARGET_NAME}_PROTOC_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${STATIC_LIBRARY_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}" PARENT_SCOPE) SET(${TARGET_NAME}_PROTOC_EXECUTABLE - "${PROTOBUF_INSTALL_DIR}/bin/protoc${EXECUTABLE_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}" PARENT_SCOPE) SET(OPTIONAL_CACHE_ARGS "") diff --git a/cmake/system.cmake b/cmake/system.cmake index 904652413e..3b5cbfdd63 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -84,24 +84,6 @@ IF(DEFINED CMAKE_SYSTEM_NAME) ENDIF() ENDIF() -# prefix and suffix on different os -IF(WIN32) - SET(LIBRARY_PREFIX "") - SET(SHARED_LIBRARY_SUFFIX ".dll") - SET(STATIC_LIBRARY_SUFFIX ".lib") - SET(EXECUTABLE_SUFFIX ".exe") -ELSE(WIN32) - SET(LIBRARY_PREFIX "lib") - IF(APPLE) - SET(SHARED_LIBRARY_SUFFIX ".dylib") - ELSE(APPLE) - SET(SHARED_LIBRARY_SUFFIX ".so") - ENDIF(APPLE) - - SET(STATIC_LIBRARY_SUFFIX ".a") - SET(EXECUTABLE_SUFFIX "") -ENDIF(WIN32) - # external dependencies log output SET(EXTERNAL_PROJECT_LOG_ARGS LOG_DOWNLOAD 0 # Wrap download in script to log output From b55df90dfdf6b9720548613885d291ae8769705b Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Fri, 23 Jun 2017 11:42:48 -0700 Subject: [PATCH 44/69] Remove unnecessary preamble --- 
paddle/memory/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/memory/README.md b/paddle/memory/README.md index b71ca29696..fd32d07ef4 100644 --- a/paddle/memory/README.md +++ b/paddle/memory/README.md @@ -1,5 +1,3 @@ -In my mind, the memory package works like the following: - ## Design ### Usage From 5a22d736513124fb03fab1fb792a35739d2dd333 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Fri, 23 Jun 2017 15:09:42 -0700 Subject: [PATCH 45/69] Add variable.h and test --- paddle/framework/CMakeLists.txt | 2 + paddle/framework/ddim_test.cc | 2 - paddle/framework/variable.h | 88 +++++++++++++++++++++++++++++++ paddle/framework/variable_test.cc | 40 ++++++++++++++ 4 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 paddle/framework/variable.h create mode 100644 paddle/framework/variable_test.cc diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 673cfa19ac..e3c3155aa9 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -2,3 +2,5 @@ cc_library(ddim SRCS ddim.cc) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) + +cc_test(variable_test SRCS variable_test.cc) diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc index e5c84d7abe..36eef02370 100644 --- a/paddle/framework/ddim_test.cc +++ b/paddle/framework/ddim_test.cc @@ -1,5 +1,3 @@ -//#include -//#include #include #include diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h new file mode 100644 index 0000000000..249b9b1b37 --- /dev/null +++ b/paddle/framework/variable.h @@ -0,0 +1,88 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#pragma once + +#include + +namespace paddle { +namespace framework { + +class Variable { + public: + template + const T& Get() const { + return *static_cast(holder_->Ptr()); + } + + template + T* GetMutable() { + if (holder_ != nullptr && typeid(T) == holder_->Type()) { + return static_cast(holder_->Ptr()); + } else { + return Reset(new T(), DefaultDeleter()); + } + } + + ~Variable() { + if (holder_ != nullptr) delete holder_; + } + + private: + // DefaultDeleter is functor which uses C++'s delete(T*). + template + struct DefaultDeleter { + void operator()(T* ptr) { delete ptr; } + }; + + struct Placeholder { + virtual ~Placeholder() {} + virtual const std::type_info& Type() const = 0; + virtual void* Ptr() const = 0; + }; + + // Placeholder hides type T, so it doesn't appear as a template + // parameter of Variable. 
+ template + struct PlaceholderImpl : public Placeholder { + typedef std::function Deleter; + + PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {} + PlaceholderImpl(T* ptr, Deleter d) + : ptr_(ptr), type_(typeid(T)), deleter_(d) {} + + virtual ~PlaceholderImpl() { + deleter_(ptr_); + ptr_ = nullptr; + } + virtual const std::type_info& Type() const { return type_; } + virtual void* Ptr() const { return ptr_; } + + T* ptr_ = nullptr; + const std::type_info& type_; + std::function deleter_ = DefaultDeleter(); + }; + + template + T* Reset(T* allocated, typename PlaceholderImpl::Deleter deleter) { + if (holder_ != nullptr) { + delete holder_; + } + holder_ = new PlaceholderImpl(allocated, deleter); + return allocated; + } + + Placeholder* holder_; // pointers to a PlaceholderImpl object indeed. +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/variable_test.cc b/paddle/framework/variable_test.cc new file mode 100644 index 0000000000..aea03bcf57 --- /dev/null +++ b/paddle/framework/variable_test.cc @@ -0,0 +1,40 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/framework/variable.h" + +TEST(Variable, GetMutable) { + using paddle::framework::Variable; + + struct Tensor { + int content_; + }; + + std::unique_ptr v(new Variable()); + + Tensor* t = v->GetMutable(); + t->content_ = 1234; + + const Tensor& tt = v->Get(); + EXPECT_EQ(1234, tt.content_); + + std::string* s = v->GetMutable(); + *s = "hello"; + + const std::string& ss = v->Get(); + EXPECT_EQ("hello", ss); +} From 58efbf41b32e3495b038978c9a06a0285542cd57 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Fri, 23 Jun 2017 16:13:00 -0700 Subject: [PATCH 46/69] Follow comments from Xu Wei --- paddle/framework/variable.h | 45 +++++++++---------------------------- 1 file changed, 10 insertions(+), 35 deletions(-) diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h index 249b9b1b37..b21c95a1a6 100644 --- a/paddle/framework/variable.h +++ b/paddle/framework/variable.h @@ -12,6 +12,8 @@ */ #pragma once +#include +#include #include namespace paddle { @@ -26,24 +28,14 @@ class Variable { template T* GetMutable() { - if (holder_ != nullptr && typeid(T) == holder_->Type()) { - return static_cast(holder_->Ptr()); - } else { - return Reset(new T(), DefaultDeleter()); + if (holder_ == nullptr || + std::type_index(typeid(T)) != std::type_index(holder_->Type())) { + holder_.reset(new PlaceholderImpl(new T())); } - } - - ~Variable() { - if (holder_ != nullptr) delete holder_; + return static_cast(holder_->Ptr()); } private: - // DefaultDeleter is functor which uses C++'s delete(T*). - template - struct DefaultDeleter { - void operator()(T* ptr) { delete ptr; } - }; - struct Placeholder { virtual ~Placeholder() {} virtual const std::type_info& Type() const = 0; @@ -54,34 +46,17 @@ class Variable { // parameter of Variable. 
  template
   struct PlaceholderImpl : public Placeholder {
-    typedef std::function Deleter;
-
     PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {}
-    PlaceholderImpl(T* ptr, Deleter d)
-        : ptr_(ptr), type_(typeid(T)), deleter_(d) {}

-    virtual ~PlaceholderImpl() {
-      deleter_(ptr_);
-      ptr_ = nullptr;
-    }
     virtual const std::type_info& Type() const { return type_; }
-    virtual void* Ptr() const { return ptr_; }
+    virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }

-    T* ptr_ = nullptr;
+    std::unique_ptr<T> ptr_;
     const std::type_info& type_;
-    std::function deleter_ = DefaultDeleter();
   };

-  template
-  T* Reset(T* allocated, typename PlaceholderImpl::Deleter deleter) {
-    if (holder_ != nullptr) {
-      delete holder_;
-    }
-    holder_ = new PlaceholderImpl(allocated, deleter);
-    return allocated;
-  }
-
-  Placeholder* holder_;  // pointers to a PlaceholderImpl object indeed.
+  std::unique_ptr<Placeholder>
+      holder_;  // points to a PlaceholderImpl object.
 };

 } // namespace framework
 } // namespace paddle

From ac28fad6f37f007cb36c92daff7492a31e5c68b3 Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Fri, 23 Jun 2017 16:59:36 -0700
Subject: [PATCH 47/69] Add type assertion in Variable::Get

---
 paddle/framework/variable.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h
index b21c95a1a6..b33e10e682 100644
--- a/paddle/framework/variable.h
+++ b/paddle/framework/variable.h
@@ -16,6 +16,8 @@
 #include 
 #include 

+#include "paddle/platform/assert.h"
+
 namespace paddle {
 namespace framework {

@@ -23,6 +25,9 @@ class Variable {
  public:
   template 
   const T& Get() const {
+    PADDLE_ASSERT(holder_ != nullptr);
+    PADDLE_ASSERT(std::type_index(typeid(T)) ==
+                  std::type_index(holder_->Type()));
     return *static_cast(holder_->Ptr());
   }

From bd4559abbee5413d322b8659929bdb203de6abaf Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Fri, 23 Jun 2017 16:59:48 -0700
Subject: [PATCH 48/69] Add design doc

---
 paddle/framework/variable.md | 52 ++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 paddle/framework/variable.md

diff --git a/paddle/framework/variable.md b/paddle/framework/variable.md
new file mode 100644
index 0000000000..f44d5ea46e
--- /dev/null
+++ b/paddle/framework/variable.md
@@ -0,0 +1,52 @@
+# Design Doc: Variable
+
+
+Variable is also known as *blob* in MxNet and Caffe2. It is the input and output type of operators, where a neural network is a graph of operators.
+
+## Requirements: Lazy Memory Allocation
+
+For the flexibility of a DL system, a variable should be able to contain any typed value -- a tensor in most cases, but could also be some integer IDs or a scope of other variables in the case of RNN.
+
+To use the minimum amount of memory, we'd like a variable to allocate memory only when it has to, i.e., lazy memory allocation. Let's take the following example:
+
+```cpp
+Variable vr, v1, v2;
+
+Tensor* t1 = new Tensor();
+Tensor* t2 = new Tensor();
+
+Randomize(
+  /* malloc */ v1.GetMutable<Tensor>()->mutable_data(DDim(100,200)),
+  /* size */ t1.Size());
+
+Randomize(
+  /* malloc */ v2.GetMutable<Tensor>()->mutable_data(DDim(200,300)),
+  /* size */ t2.Size());
+
+Mult(
+  /*result*/ vr.GetMutable<Tensor>()->mutable_data(SizeOfMult(v1, v2)),
+  /*input1*/ v1.Get<Tensor>().data(),
+  /*input2*/ v2.Get<Tensor>().data());
+```
+
+We see that a variable holds nothing until `Variable::GetMutable()` allocates a tensor and puts it in the variable. Similarly, a tensor does not get its memory until `Tensor::mutable_data()` is called.
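+
+A hedged sketch of what this buys us, reusing the `Variable`, `Tensor`, and `DDim` names assumed above: constructing a variable is essentially free, and memory is paid for only on first write:
+
+```cpp
+Variable v;                          // holds nothing, costs nothing
+Tensor* t = v.GetMutable<Tensor>();  // allocates just the Tensor object
+t->mutable_data(DDim(100, 200));     // the actual buffer appears here
+```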
+
+This syntax makes lazy memory allocation happen when we call `Randomize` and `Mult`, the functions that mutate the variable, so it saves us some lines of C++ code.
+
+
+## Implementation: Type Hiding
+
+To make memory allocation lazy, we cannot assume that we know the type held by a variable at definition time. In other words, `class Variable` cannot be a template `template <typename T> class Variable`.
+
+Because we don't know the type `T`, we cannot save a `T*` as `Variable`'s data member. Instead, we save an interface object `Placeholder`, which can return the pointer to the saved object via `Placeholder::Ptr()` as `void*`.
+
+`Variable` still needs to know `T` so that it can `delete(ptr)`, and so that `Variable::Get` can check the expected type against the saved object's type.
+
+We save `T` in `PlaceholderImpl`, the implementation of `Placeholder`. Please be aware that `PlaceholderImpl` is a class template and `T` is passed in as a template parameter.
+
+Because `PlaceholderImpl` knows `T`, it can save and return `typeid(T)` for the type comparison in `Variable::Get` and `Variable::GetMutable`.
+
+
+## Conclusion
+
+The type-hiding technique utilizes C++ class templates, interfaces and derivation, and C++ RTTI (`typeid`). This combination saves us from defining something like `caffe2::TypeMeta`, which takes hundreds of lines of C++ code.

From fd8937556f95db4086ce095efa1e83041c896334 Mon Sep 17 00:00:00 2001
From: Helin Wang
Date: Tue, 20 Jun 2017 23:57:07 +0000
Subject: [PATCH 49/69] Master save and load state from etcd

---
 go/cmd/master/master.go           |  55 ++++++++++--
 go/master/client_internal_test.go |  21 ++++-
 go/master/client_test.go          |  21 ++++-
 go/master/etcd_store.go           | 133 ++++++++++++++++++++++
 go/master/service.go              | 142 +++++++++++++++++++++++-------
 go/pserver/cclient/cclient.go     |   6 +-
 6 files changed, 330 insertions(+), 48 deletions(-)
 create mode 100644 go/master/etcd_store.go

diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go
index 25cd1cafcd..49ad0300b8 100644
--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
@@ -5,41 +5,80 @@ import (
 	"net/http"
 	"net/rpc"
 	"strconv"
+	"strings"
+	"sync"
 	"time"

 	"github.com/namsral/flag"
+	log "github.com/sirupsen/logrus"

 	"github.com/PaddlePaddle/Paddle/go/master"
 )

+type inMemStore struct {
+	mu  sync.Mutex
+	buf []byte
+}
+
+func (m *inMemStore) Save(b []byte) error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	m.buf = b
+	return nil
+}
+
+func (m *inMemStore) Load() ([]byte, error) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	return m.buf, nil
+}
+
 func main() {
 	port := flag.Int("port", 8080, "port of the master server.")
-	faultTolerance := flag.Bool("fault_tolerance", false, "enable fault tolerance (requires etcd).")
+	ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
+	endpoints := flag.String("endpoints", "", "comma separated etcd endpoints. 
If empty, fault tolerance will not be enabled.") taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.") taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.") chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.") flag.Parse() - if *faultTolerance { - panic("fault tolernance not implemented.") + if *endpoints == "" { + log.Warningln("-endpoints not set, fault tolerance not be enabled.") + } + + var store master.Store + if *endpoints != "" { + eps := strings.Split(*endpoints, ",") + var err error + store, err = master.NewEtcdStore(eps, master.DefaultLockPath, master.DefaultStatePath, *ttlSec) + if err != nil { + log.Fatal(err) + } + } else { + store = &inMemStore{} + } + s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax) + if err != nil { + log.Fatal(err) } - s := master.NewService(*chunkPerTask, *taskTimeoutDur, *taskTimeoutMax) - err := rpc.Register(s) + err = rpc.Register(s) if err != nil { - panic(err) + log.Fatal(err) } rpc.HandleHTTP() l, err := net.Listen("tcp", ":"+strconv.Itoa(*port)) if err != nil { - panic(err) + log.Fatal(err) } err = http.Serve(l, nil) if err != nil { - panic(err) + log.Fatal(err) } } diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go index 00fcca0e2c..a5b76fe853 100644 --- a/go/master/client_internal_test.go +++ b/go/master/client_internal_test.go @@ -32,6 +32,19 @@ func (a TestAddresser) Address() string { return string(a) } +type myStore struct { + buf []byte +} + +func (m *myStore) Save(b []byte) error { + m.buf = b + return nil +} + +func (m *myStore) Load() ([]byte, error) { + return m.buf, nil +} + func TestGetFinishTask(t *testing.T) { const path = "/tmp/master_client_test_0" @@ -47,9 +60,13 @@ func TestGetFinishTask(t *testing.T) { } go func(l net.Listener) { - s := NewService(chunkPerTask, time.Second, 1) + s, err := NewService(&myStore{}, chunkPerTask, time.Second, 1) + if err != nil { + panic(err) + } + server := rpc.NewServer() - err := server.Register(s) + err = server.Register(s) if err != nil { panic(err) } diff --git a/go/master/client_test.go b/go/master/client_test.go index 2b3f873ecf..ae5f17c2d4 100644 --- a/go/master/client_test.go +++ b/go/master/client_test.go @@ -15,6 +15,19 @@ import ( "github.com/PaddlePaddle/recordio" ) +type myStore struct { + buf []byte +} + +func (m *myStore) Save(b []byte) error { + m.buf = b + return nil +} + +func (m *myStore) Load() ([]byte, error) { + return m.buf, nil +} + func TestNextRecord(t *testing.T) { const ( path = "/tmp/master_client_TestFull" @@ -33,9 +46,13 @@ func TestNextRecord(t *testing.T) { } go func(l net.Listener) { - s := master.NewService(10, time.Second, 1) + s, err := master.NewService(&myStore{}, 10, time.Second, 1) + if err != nil { + panic(err) + } + server := rpc.NewServer() - err := server.Register(s) + err = server.Register(s) if err != nil { panic(err) } diff --git a/go/master/etcd_store.go b/go/master/etcd_store.go new file mode 100644 index 0000000000..ce178370ff --- /dev/null +++ b/go/master/etcd_store.go @@ -0,0 +1,133 @@ +package master + +import ( + "context" + "sync" + + "github.com/coreos/etcd/clientv3" + "github.com/coreos/etcd/clientv3/concurrency" + log "github.com/sirupsen/logrus" +) + +const ( + // DefaultLockPath is the default etcd master lock path. + DefaultLockPath = "/master/lock" + // DefaultStatePath is the default etcd key for master state. 
+ DefaultStatePath = "/master/state" +) + +// EtcdStore is the Store implementation backed by etcd. +type EtcdStore struct { + lockPath string + statePath string + ttlSec int + client *clientv3.Client + + mu sync.Mutex + lock *concurrency.Mutex +} + +// NewEtcdStore creates a new EtcdStore. +func NewEtcdStore(endpoints []string, lockPath, statePath string, ttlSec int) (*EtcdStore, error) { + cli, err := clientv3.New(clientv3.Config{ + Endpoints: endpoints, + DialTimeout: dialTimeout, + }) + if err != nil { + return nil, err + } + + sess, err := concurrency.NewSession(cli, concurrency.WithTTL(ttlSec)) + if err != nil { + return nil, err + } + + lock := concurrency.NewMutex(sess, lockPath) + // It's fine for the lock to get stuck, in this case we have + // multiple master servers running (only configured to have + // one master running, but split-brain problem may cuase + // multiple master servers running), and the cluster management + // software will kill one of them. + log.Infof("Trying to acquire lock at %s.", lockPath) + err = lock.Lock(context.TODO()) + if err != nil { + return nil, err + } + log.Infof("Successfully acquired lock at %s.", lockPath) + + e := &EtcdStore{} + e.client = cli + e.lock = lock + e.lockPath = lockPath + e.statePath = statePath + e.ttlSec = ttlSec + return e, nil +} + +// Save saves the state into the etcd. +func (e *EtcdStore) Save(state []byte) error { + e.mu.Lock() + defer e.mu.Unlock() + + ctx := context.TODO() + put := clientv3.OpPut(e.statePath, string(state)) + resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit() + if err != nil { + return err + } + + if !resp.Succeeded { + log.Errorln("No longer owns the lock, trying to lock and save again.") + sess, err := concurrency.NewSession(e.client, concurrency.WithTTL(e.ttlSec)) + if err != nil { + return err + } + + e.lock = concurrency.NewMutex(sess, e.lockPath) + log.Infof("Try to acquire lock at %s.", e.lockPath) + err = e.lock.Lock(context.TODO()) + if err != nil { + return err + } + log.Infof("Successfully acquired lock at %s.", e.lockPath) + return e.Save(state) + } + + return nil +} + +// Load loads the state from etcd. +func (e *EtcdStore) Load() ([]byte, error) { + e.mu.Lock() + ctx := context.TODO() + get := clientv3.OpGet(e.statePath) + + resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(get).Commit() + if err != nil { + return nil, err + } + + if !resp.Succeeded { + log.Errorln("No longer owns the lock, trying to lock and load again.") + sess, err := concurrency.NewSession(e.client) + if err != nil { + return nil, err + } + + e.lock = concurrency.NewMutex(sess, e.lockPath) + e.lock.Lock(context.TODO()) + e.mu.Unlock() + return e.Load() + } + + kvs := resp.Responses[0].GetResponseRange().Kvs + if len(kvs) == 0 { + // No state exists + e.mu.Unlock() + return nil, nil + } + + state := kvs[0].Value + e.mu.Unlock() + return state, nil +} diff --git a/go/master/service.go b/go/master/service.go index 55e1e2d1a4..d453777b05 100644 --- a/go/master/service.go +++ b/go/master/service.go @@ -1,6 +1,9 @@ package master import ( + "bytes" + "compress/gzip" + "encoding/gob" "errors" "os" "path/filepath" @@ -12,24 +15,54 @@ import ( "github.com/PaddlePaddle/recordio" ) +const ( + dialTimeout = 5 * time.Second +) + +// Store is the interface for save and load the master state. +type Store interface { + Save([]byte) error + Load() ([]byte, error) +} + +// Chunk is a chunk of data consisted of several data instances. 
+type Chunk struct { + Path string + Index recordio.Index // chunk index +} + +// Task is the basic unit of data instances assigned to trainers. +type Task struct { + ID int + Chunks []Chunk +} + +type taskEntry struct { + Epoch int + NumTimeout int + Task Task +} + +type taskQueues struct { + Todo []taskEntry + Pending map[int]taskEntry // map from task ID to task entry + Done []taskEntry + Failed []Task +} + // Service is the master server service. type Service struct { chunksPerTask int timeoutDur time.Duration timeoutMax int ready chan struct{} + store Store mu sync.Mutex initDone bool taskQueues taskQueues } -// Recover recovers service state from etcd. -func Recover() (*Service, error) { - // TODO(helin): recover from snapshot state from etcd. - return nil, nil -} - func partition(chunks []Chunk, chunksPerTask int) []taskEntry { id := 0 if chunksPerTask <= 0 { @@ -58,7 +91,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry { } // NewService creates a new service. -func NewService(chunksPerTask int, timeoutDur time.Duration, timeoutMax int) *Service { +func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) (*Service, error) { s := &Service{} s.chunksPerTask = chunksPerTask s.timeoutDur = timeoutDur @@ -66,38 +99,81 @@ func NewService(chunksPerTask int, timeoutDur time.Duration, timeoutMax int) *Se s.taskQueues = taskQueues{} s.taskQueues.Pending = make(map[int]taskEntry) s.ready = make(chan struct{}) - return s -} + s.store = store + recovered, err := s.recover() + if err != nil { + return nil, err + } -// Chunk is a chunk of data consisted of several data instances. -type Chunk struct { - Path string - Index recordio.Index // chunk index -} + if recovered { + // Recovered. Now the state is already initialized, + // and the master is ready. + s.initDone = true + close(s.ready) + } -// Task is the basic unit of data instances assigned to trainers. -type Task struct { - ID int - Chunks []Chunk + return s, nil } -type taskEntry struct { - Epoch int - NumTimeout int - Task Task -} +// recover recovers service state from etcd. +func (s *Service) recover() (bool, error) { + state, err := s.store.Load() + if err != nil { + return false, err + } -type taskQueues struct { - Todo []taskEntry - Pending map[int]taskEntry // map from task ID to task entry - Done []taskEntry - Failed []Task + if state == nil { + log.Infoln("No state exists, not recovered.") + return false, nil + } + + log.Infof("Loaded snapshot of size: %d bytes.", len(state)) + gr, err := gzip.NewReader(bytes.NewReader(state)) + if err != nil { + return false, err + } + + dec := gob.NewDecoder(gr) + var tqs taskQueues + err = dec.Decode(&tqs) + if err != nil { + return false, err + } + + err = gr.Close() + if err != nil { + // Only close failed, recover actually succeed, so + // just log error. + log.Errorln(err) + } + + s.taskQueues = tqs + return true, nil } -// *must* be called with s.mu being held. +// snapshot *must* be called with s.mu being held. func (s *Service) snapshot() error { - // TODO(helin): snapshot state on etcd. - return nil + // TOOD(helin): etcd request has a size limit, so the snapshot + // size is limited by the max request size. 
We should either + // divide the snapshot into smaller chunks and save under + // different keys, or configure the request size to be big + // enough: + // https://github.com/coreos/etcd/blob/2f84f3d8d8ed8f9537ab6ffa44a3a1c7eddfa9b1/embed/config.go#L44 + var buf bytes.Buffer + gw := gzip.NewWriter(&buf) + enc := gob.NewEncoder(gw) + err := enc.Encode(s.taskQueues) + if err != nil { + return err + } + err = gw.Close() + if err != nil { + return err + } + + state := buf.Bytes() + log.Infof("Saving snapshot of size: %d bytes.", len(state)) + return s.store.Save(state) } func readChunks(globPaths []string) ([]Chunk, error) { @@ -207,12 +283,12 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() { t.NumTimeout++ if t.NumTimeout > s.timeoutMax { - log.Warningf("Task %v timed out %d times, discard.\n", t.Task, t.NumTimeout) + log.Warningf("Task %v timed out %d times, discard.", t.Task, t.NumTimeout) s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task) return } - log.Warningf("Task %v timed out %d times, retry.\n", t.Task, t.NumTimeout) + log.Warningf("Task %v timed out %d times, retry.", t.Task, t.NumTimeout) s.taskQueues.Todo = append(s.taskQueues.Todo, t) } } diff --git a/go/pserver/cclient/cclient.go b/go/pserver/cclient/cclient.go index 92a41b7f54..bbaf43d9f1 100644 --- a/go/pserver/cclient/cclient.go +++ b/go/pserver/cclient/cclient.go @@ -133,7 +133,7 @@ func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, if err != nil { if err.Error() == pserver.AlreadyInitialized { - log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.\n", name) + log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.", name) return C.PSERVER_OK } log.Errorln(err) @@ -200,7 +200,7 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, for i, p := range ps { pn[i] = p.Name } - log.Errorf("pserver returned wrong number of parameters. Requested: %s, returned: %s.\n", strings.Join(pn, ", "), strings.Join(ns, ", ")) + log.Errorf("pserver returned wrong number of parameters. Requested: %s, returned: %s.", strings.Join(pn, ", "), strings.Join(ns, ", ")) return C.PSERVER_ERROR } @@ -210,7 +210,7 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, for i, p := range ps { pn[i] = p.Name } - log.Errorf("pserver returned wrong parameters, or not in requested order. Requested: %s, returned: %s.\n", strings.Join(pn, ", "), strings.Join(ns, ", ")) + log.Errorf("pserver returned wrong parameters, or not in requested order. 
Requested: %s, returned: %s.", strings.Join(pn, ", "), strings.Join(ns, ", "))
 		return C.PSERVER_ERROR
 	}
 }

From 44226853029119e195530e78ff7d0ab883b72dff Mon Sep 17 00:00:00 2001
From: Helin Wang
Date: Wed, 21 Jun 2017 18:55:49 +0000
Subject: [PATCH 50/69] put InMemStore into master package

---
 go/cmd/master/master.go           | 23 +----------------------
 go/master/client_internal_test.go | 15 +--------------
 go/master/client_test.go          | 15 +--------------
 go/master/inmem_store.go          | 28 ++++++++++++++++++++++++++++
 4 files changed, 31 insertions(+), 50 deletions(-)
 create mode 100644 go/master/inmem_store.go

diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go
index 49ad0300b8..48fe2e6f75 100644
--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
@@ -6,7 +6,6 @@ import (
 	"net/rpc"
 	"strconv"
 	"strings"
-	"sync"
 	"time"
 
 	"github.com/namsral/flag"
@@ -15,26 +14,6 @@ import (
 	"github.com/PaddlePaddle/Paddle/go/master"
 )
 
-type inMemStore struct {
-	mu  sync.Mutex
-	buf []byte
-}
-
-func (m *inMemStore) Save(b []byte) error {
-	m.mu.Lock()
-	defer m.mu.Unlock()
-
-	m.buf = b
-	return nil
-}
-
-func (m *inMemStore) Load() ([]byte, error) {
-	m.mu.Lock()
-	defer m.mu.Unlock()
-
-	return m.buf, nil
-}
-
 func main() {
 	port := flag.Int("port", 8080, "port of the master server.")
 
@@ -58,7 +37,7 @@ func main() {
 			log.Fatal(err)
 		}
 	} else {
-		store = &inMemStore{}
+		store = &master.InMemStore{}
 	}
 
 	s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go
index a5b76fe853..251225780a 100644
--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
@@ -32,19 +32,6 @@ func (a TestAddresser) Address() string {
 	return string(a)
 }
 
-type myStore struct {
-	buf []byte
-}
-
-func (m *myStore) Save(b []byte) error {
-	m.buf = b
-	return nil
-}
-
-func (m *myStore) Load() ([]byte, error) {
-	return m.buf, nil
-}
-
 func TestGetFinishTask(t *testing.T) {
 	const path = "/tmp/master_client_test_0"
 
@@ -60,7 +47,7 @@ func TestGetFinishTask(t *testing.T) {
 	}
 	go func(l net.Listener) {
-		s, err := NewService(&myStore{}, chunkPerTask, time.Second, 1)
+		s, err := NewService(&InMemStore{}, chunkPerTask, time.Second, 1)
 		if err != nil {
 			panic(err)
 		}
diff --git a/go/master/client_test.go b/go/master/client_test.go
index ae5f17c2d4..85a86761c2 100644
--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@@ -15,19 +15,6 @@ import (
 	"github.com/PaddlePaddle/recordio"
 )
 
-type myStore struct {
-	buf []byte
-}
-
-func (m *myStore) Save(b []byte) error {
-	m.buf = b
-	return nil
-}
-
-func (m *myStore) Load() ([]byte, error) {
-	return m.buf, nil
-}
-
 func TestNextRecord(t *testing.T) {
 	const (
 		path = "/tmp/master_client_TestFull"
@@ -46,7 +33,7 @@ func TestNextRecord(t *testing.T) {
 	}
 	go func(l net.Listener) {
-		s, err := master.NewService(&myStore{}, 10, time.Second, 1)
+		s, err := master.NewService(&master.InMemStore{}, 10, time.Second, 1)
 		if err != nil {
 			panic(err)
 		}
diff --git a/go/master/inmem_store.go b/go/master/inmem_store.go
new file mode 100644
index 0000000000..bcd549b20e
--- /dev/null
+++ b/go/master/inmem_store.go
@@ -0,0 +1,28 @@
+package master
+
+import "sync"
+
+// InMemStore is an in-memory implementation of the Store interface.
+//
+// It does not tolerate the fault that causes the program to crash.
+type InMemStore struct {
+	mu  sync.Mutex
+	buf []byte
+}
+
+// Save saves the state into the in-memory store. 
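+// Save keeps only the latest snapshot: each call replaces the previous
+// buffer, and m.mu serializes concurrent Save and Load calls.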
+func (m *InMemStore) Save(state []byte) error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	m.buf = state
+	return nil
+}
+
+// Load loads the state from the in-memory store.
+func (m *InMemStore) Load() ([]byte, error) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	return m.buf, nil
+}

From a4ba403e792fc21b5e032ad6116f1fc00fb4ba8d Mon Sep 17 00:00:00 2001
From: Helin Wang
Date: Wed, 21 Jun 2017 19:00:25 +0000
Subject: [PATCH 51/69] add comment for gracefully stopping etcd store

---
 go/master/etcd_store.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/go/master/etcd_store.go b/go/master/etcd_store.go
index ce178370ff..d8e95056d5 100644
--- a/go/master/etcd_store.go
+++ b/go/master/etcd_store.go
@@ -29,6 +29,10 @@ type EtcdStore struct {
 
 // NewEtcdStore creates a new EtcdStore.
 func NewEtcdStore(endpoints []string, lockPath, statePath string, ttlSec int) (*EtcdStore, error) {
+	// TODO(helin): gracefully shutdown etcd store. Because etcd
+	// store holds an etcd lock, even though the lock will expire
+	// when the lease times out, we need to implement graceful
+	// shutdown to release the lock.
 	cli, err := clientv3.New(clientv3.Config{
 		Endpoints:   endpoints,
 		DialTimeout: dialTimeout,
From bf79c9e5bba41dd9f1e122a779e27e3e8dca9ee3 Mon Sep 17 00:00:00 2001
From: Helin Wang
Date: Wed, 21 Jun 2017 19:02:21 +0000
Subject: [PATCH 52/69] add log when master recovered from saved state.

---
 go/master/service.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/go/master/service.go b/go/master/service.go
index d453777b05..58e68e7448 100644
--- a/go/master/service.go
+++ b/go/master/service.go
@@ -110,6 +110,7 @@ func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, timeou
 		// and the master is ready.
 		s.initDone = true
 		close(s.ready)
+		log.Info("Master recovered from saved state.")
 	}
 
 	return s, nil
From 42313a3c35637b8d706aa4dbdef65c671e7d6665 Mon Sep 17 00:00:00 2001
From: Helin Wang
Date: Fri, 23 Jun 2017 22:11:45 +0000
Subject: [PATCH 53/69] rename EtcdStore to Etcd

---
 go/cmd/master/master.go |  2 +-
 go/master/etcd_store.go | 21 +++++++++++++--------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go
index 48fe2e6f75..a62bc4310e 100644
--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
@@ -32,7 +32,7 @@ func main() {
 	if *endpoints != "" {
 		eps := strings.Split(*endpoints, ",")
 		var err error
-		store, err = master.NewEtcdStore(eps, master.DefaultLockPath, master.DefaultStatePath, *ttlSec)
+		store, err = master.NewEtcd(eps, master.DefaultLockPath, master.DefaultStatePath, *ttlSec)
 		if err != nil {
 			log.Fatal(err)
 		}
diff --git a/go/master/etcd_store.go b/go/master/etcd_store.go
index d8e95056d5..21b3e2cb0f 100644
--- a/go/master/etcd_store.go
+++ b/go/master/etcd_store.go
@@ -16,8 +16,9 @@ const (
 	DefaultStatePath = "/master/state"
 )
 
-// EtcdStore is the Store implementation backed by etcd.
-type EtcdStore struct {
+// Etcd is the etcd abstraction that master uses for fault tolerance
+// and service registry.
+type Etcd struct {
 	lockPath  string
 	statePath string
 	ttlSec    int
@@ -27,8 +28,8 @@ type EtcdStore struct {
 	lock *concurrency.Mutex
 }
 
-// NewEtcdStore creates a new EtcdStore.
-func NewEtcdStore(endpoints []string, lockPath, statePath string, ttlSec int) (*Etcd, error) {
+// NewEtcd creates a new Etcd.
+func NewEtcd(endpoints []string, lockPath, statePath string, ttlSec int) (*Etcd, error) {
 	// TODO(helin): gracefully shutdown etcd store. 
Because etcd
 	// store holds an etcd lock, even though the lock will expire
 	// when the lease times out, we need to implement graceful
 	// shutdown to release the lock.
 	cli, err := clientv3.New(clientv3.Config{
 		Endpoints:   endpoints,
 		DialTimeout: dialTimeout,
@@ -59,7 +60,7 @@ func NewEtcdStore(endpoints []string, lockPath, statePath string, ttlSec int) (*
 	}
 	log.Infof("Successfully acquired lock at %s.", lockPath)
 
-	e := &EtcdStore{}
+	e := &Etcd{}
 	e.client = cli
 	e.lock = lock
 	e.lockPath = lockPath
@@ -69,7 +70,7 @@ func NewEtcdStore(endpoints []string, lockPath, statePath string, ttlSec int) (*
 }
 
 // Save saves the state into the etcd.
-func (e *EtcdStore) Save(state []byte) error {
+func (e *Etcd) Save(state []byte) error {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 
@@ -101,7 +102,7 @@ func (e *EtcdStore) Save(state []byte) error {
 }
 
 // Load loads the state from etcd.
-func (e *EtcdStore) Load() ([]byte, error) {
+func (e *Etcd) Load() ([]byte, error) {
 	e.mu.Lock()
 	ctx := context.TODO()
 	get := clientv3.OpGet(e.statePath)
@@ -119,8 +120,12 @@ func (e *EtcdStore) Load() ([]byte, error) {
 	}
 
 	e.lock = concurrency.NewMutex(sess, e.lockPath)
-	e.lock.Lock(context.TODO())
+	err = e.lock.Lock(context.TODO())
 	e.mu.Unlock()
+	if err != nil {
+		return nil, err
+	}
+
 	return e.Load()
 }

From 7dad02661f1cd7406eac871354c94cebf4d38345 Mon Sep 17 00:00:00 2001
From: Helin Wang
Date: Sat, 24 Jun 2017 00:04:26 +0000
Subject: [PATCH 54/69] Master server registers itself to etcd.

---
 go/cmd/master/master.go                     | 14 +++-
 go/master/{etcd_store.go => etcd_client.go} | 90 +++++++++++----------
 2 files changed, 56 insertions(+), 48 deletions(-)
 rename go/master/{etcd_store.go => etcd_client.go} (56%)

diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go
index a62bc4310e..54fa254863 100644
--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
@@ -1,6 +1,7 @@
 package main
 
 import (
+	"fmt"
 	"net"
 	"net/http"
 	"net/rpc"
@@ -12,13 +13,13 @@ import (
 	log "github.com/sirupsen/logrus"
 
 	"github.com/PaddlePaddle/Paddle/go/master"
+	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
 )
 
 func main() {
 	port := flag.Int("port", 8080, "port of the master server.")
-
 	ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
-	endpoints := flag.String("endpoints", "", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
+	endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. 
If empty, fault tolerance will not be enabled.")
 	taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timeout duration.")
 	taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timeout count for each task before it is declared a failed task.")
 	chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
@@ -31,8 +32,13 @@ func main() {
 	var store master.Store
 	if *endpoints != "" {
 		eps := strings.Split(*endpoints, ",")
-		var err error
-		store, err = master.NewEtcd(eps, master.DefaultLockPath, master.DefaultStatePath, *ttlSec)
+		ip, err := networkhelper.GetExternalIP()
+		if err != nil {
+			log.Fatal(err)
+		}
+
+		addr := fmt.Sprintf("%s:%d", ip, *port)
+		store, err = master.NewEtcdClient(eps, addr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, *ttlSec)
 		if err != nil {
 			log.Fatal(err)
 		}
diff --git a/go/master/etcd_store.go b/go/master/etcd_client.go
similarity index 56%
rename from go/master/etcd_store.go
rename to go/master/etcd_client.go
index 21b3e2cb0f..b7293a7598 100644
--- a/go/master/etcd_store.go
+++ b/go/master/etcd_client.go
@@ -2,7 +2,7 @@ package master
 
 import (
 	"context"
-	"sync"
+	"time"
 
 	"github.com/coreos/etcd/clientv3"
 	"github.com/coreos/etcd/clientv3/concurrency"
@@ -14,22 +14,22 @@ const (
 	DefaultLockPath = "/master/lock"
 	// DefaultStatePath is the default etcd key for master state.
 	DefaultStatePath = "/master/state"
+	// DefaultAddrPath is the default etcd key for master address.
+	DefaultAddrPath = "/master/addr"
 )
 
-// Etcd is the etcd abstraction that master uses for fault tolerance
+// EtcdClient is the etcd client that master uses for fault tolerance
 // and service registry.
-type Etcd struct {
+type EtcdClient struct {
 	lockPath  string
 	statePath string
-	ttlSec    int
 	client    *clientv3.Client
-
-	mu   sync.Mutex
-	lock *concurrency.Mutex
+	lock      *concurrency.Mutex
 }
 
-// NewEtcd creates a new Etcd.
-func NewEtcd(endpoints []string, lockPath, statePath string, ttlSec int) (*Etcd, error) {
+// NewEtcdClient creates a new EtcdClient.
+func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
+	log.Debugf("Connecting to etcd at %v", endpoints)
 	// TODO(helin): gracefully shutdown etcd store. Because etcd
 	// store holds an etcd lock, even though the lock will expire
 	// when the lease times out, we need to implement graceful
 	// shutdown to release the lock.
@@ -53,27 +53,35 @@ func NewEtcd(endpoints []string, lockPath, statePath string, ttlSec int) (*Etcd,
 	// one master running, but split-brain problem may cause
 	// multiple master servers running), and the cluster management
 	// software will kill one of them.
-	log.Infof("Trying to acquire lock at %s.", lockPath)
+	log.Debugf("Trying to acquire lock at %s.", lockPath)
 	err = lock.Lock(context.TODO())
 	if err != nil {
 		return nil, err
 	}
-	log.Infof("Successfully acquired lock at %s.", lockPath)
-
-	e := &Etcd{}
-	e.client = cli
-	e.lock = lock
-	e.lockPath = lockPath
-	e.statePath = statePath
-	e.ttlSec = ttlSec
+	log.Debugf("Successfully acquired lock at %s.", lockPath)
+
+	put := clientv3.OpPut(addrPath, string(addr))
+	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
+	if err != nil {
+		return nil, err
+	}
+
+	if !resp.Succeeded {
+		log.Fatal("No longer owns the master lock. Exiting.")
+	}
+
+	e := &EtcdClient{
+		lockPath:  lockPath,
+		statePath: statePath,
+		client:    cli,
+		lock:      lock,
+	}
+
 	return e, nil
 }
 
 // Save saves the state into the etcd. 
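 // The put below runs inside an etcd transaction that commits only if this
 // process still owns the master lock, so a master that has lost the lock
 // can never overwrite state saved by a newer master.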
-func (e *Etcd) Save(state []byte) error {
-	e.mu.Lock()
-	defer e.mu.Unlock()
-
+func (e *EtcdClient) Save(state []byte) error {
 	ctx := context.TODO()
 	put := clientv3.OpPut(e.statePath, string(state))
 	resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
@@ -82,17 +90,21 @@ func (e *Etcd) Save(state []byte) error {
 	}
 
 	if !resp.Succeeded {
-		log.Errorln("No longer owns the lock, trying to lock and save again.")
-		sess, err := concurrency.NewSession(e.client, concurrency.WithTTL(e.ttlSec))
-		if err != nil {
-			return err
-		}
-
-		e.lock = concurrency.NewMutex(sess, e.lockPath)
-		log.Infof("Try to acquire lock at %s.", e.lockPath)
-		err = e.lock.Lock(context.TODO())
+		log.Errorln("No longer owns the lock, trying to lock again")
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		err := e.lock.Lock(ctx)
+		cancel()
 		if err != nil {
-			return err
+			// We lost the master lock and cannot acquire
+			// it back, which means some other master is
+			// already started. We don't want the cluster
+			// management system to kill the master server
+			// who is holding the lock and running
+			// correctly. So the most feasible solution is
+			// to kill the current master server. The current
+			// state is not saved, but the trainer's RPC
+			// call will fail, so the trainer will retry.
			log.Fatalf("Could not acquire the lock at %s: %v. Exiting.", e.lockPath, err)
 		}
 		log.Infof("Successfully acquired lock at %s.", e.lockPath)
 		return e.Save(state)
@@ -102,8 +114,7 @@ func (e *Etcd) Save(state []byte) error {
 }
 
 // Load loads the state from etcd.
-func (e *Etcd) Load() ([]byte, error) {
-	e.mu.Lock()
+func (e *EtcdClient) Load() ([]byte, error) {
 	ctx := context.TODO()
 	get := clientv3.OpGet(e.statePath)
 
@@ -114,14 +125,7 @@ func (e *Etcd) Load() ([]byte, error) {
 
 	if !resp.Succeeded {
 		log.Errorln("No longer owns the lock, trying to lock and load again.")
-		sess, err := concurrency.NewSession(e.client)
-		if err != nil {
-			return nil, err
-		}
-
-		e.lock = concurrency.NewMutex(sess, e.lockPath)
-		err = e.lock.Lock(context.TODO())
-		e.mu.Unlock()
+		err = e.lock.Lock(context.Background())
 		if err != nil {
 			return nil, err
 		}
@@ -132,11 +136,9 @@ func (e *Etcd) Load() ([]byte, error) {
 	kvs := resp.Responses[0].GetResponseRange().Kvs
 	if len(kvs) == 0 {
 		// No state exists
-		e.mu.Unlock()
 		return nil, nil
 	}
 
 	state := kvs[0].Value
-	e.mu.Unlock()
 	return state, nil
 }
From a7865a37768b8d320378c50b517ddd1fdf6db934 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Sat, 24 Jun 2017 17:01:48 +0800
Subject: [PATCH 55/69] Fix macOS compile

Please use `override`, not `virtual`, in sub-classes. With `override` the
compiler checks at compile time that a method with the same signature
exists in the parent class.
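For example, a minimal sketch (the class names are illustrative only, not
taken from this patch):

```cpp
class Base {
public:
  virtual ~Base() {}
  virtual void eval(int id) {}
};

class Derived : public Base {
public:
  // With plain `virtual`, a mismatched signature silently declares a new
  // virtual function instead of overriding Base::eval(int).
  virtual void eval(long id) {}

  // With `override`, the same mistake fails to compile:
  //   void eval(long id) override;  // error: does not override
  void eval(int id) override {}  // OK: overrides Base::eval(int)
};
```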
--- .../gserver/gradientmachines/NeuralNetwork.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index 514c0759e1..2e839f6405 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -309,35 +309,35 @@ public:
   void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
     evaluators_.emplace_back(std::move(evaluator));
   }
-  virtual void start() {
+  void start() override {
     for (auto& evaluator : evaluators_) {
       evaluator->start();
     }
   }
 
-  virtual void finish() {
+  void finish() override {
     for (auto& evaluator : evaluators_) {
       evaluator->finish();
     }
   }
 
-  virtual void eval(const NeuralNetwork& nn) override {
+  void eval(const NeuralNetwork& nn) override {
     for (auto& evaluator : evaluators_) {
       evaluator->eval(nn);
     }
   }
 
-  virtual real evalImp(std::vector<Argument>& arguments) {
+  real evalImp(std::vector<Argument>& arguments) override {
     (void)arguments;
     return -1;
   }
 
-  virtual void printStats(std::ostream& os) const {
+  void printStats(std::ostream& os) const override {
     for (auto& evaluator : evaluators_) {
       evaluator->printStats(os);
       os << ' ';
     }
   }
 
-  virtual void distributeEval(ParameterClient2* client) {
+  void distributeEval(ParameterClient2* client) override {
     for (auto& evaluator : evaluators_) {
       evaluator->distributeEval(client);
     }
@@ -352,7 +352,7 @@ public:
    * @brief getNames will return all inside evaluators' names.
    * @param names [out]: return names.
    */
-  void getNames(std::vector<std::string>* names) {
+  void getNames(std::vector<std::string>* names) override {
     for (auto& eval : evaluators_) {
       eval->getNames(names);
     }
@@ -361,7 +361,7 @@ public:
   /**
    * @brief getValue could get all inside evaluators' value.
    */
-  real getValue(const std::string& name, Error* err) const {
+  real getValue(const std::string& name, Error* err) const override {
     return this->getMethodHelper(
         name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
           return eval->getValue(name, err);
@@ -371,7 +371,7 @@ public:
   /**
    * @brief getType could get all inside evaluators' type.
   */
-  std::string getType(const std::string& name, Error* err) const {
+  std::string getType(const std::string& name, Error* err) const override {
     return this->getMethodHelper(
         name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
          return eval->getType(name, err);
From 8c735c8b092c9f21161bf7b8f8deb8b2f2047184 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Sat, 24 Jun 2017 18:37:48 +0800
Subject: [PATCH 56/69] add dependency

---
 python/setup.py.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/setup.py.in b/python/setup.py.in
index 2e22f640cb..86fc0fc5c0 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -13,6 +13,7 @@ packages=['paddle',
 setup_requires=["requests",
                 "numpy",
                 "protobuf==3.1",
+                "recordio",
                 "matplotlib",
                 "rarfile"]
 
From b359d5c5cdffb05679245886dbb3193981a4d442 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Sat, 24 Jun 2017 18:48:02 +0800
Subject: [PATCH 57/69] restore creator.py

---
 python/paddle/v2/reader/creator.py | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index 994062096f..07142056f8 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -16,7 +16,7 @@
 Creator package contains some simple reader creator, which could be used in user
 program. 
""" -__all__ = ['np_array', 'text_file', "RecordIO"] +__all__ = ['np_array', 'text_file'] def np_array(x): @@ -55,22 +55,3 @@ def text_file(path): f.close() return reader - - -def RecordIO(path): - """ - Creates a data reader that outputs record one one by one from given recordio file - :path: path of recordio file - :returns: data reader of recordio file - """ - - def reader(): - f = recordio.reader(path) - while True: - r = f.read() - if r is None: - break - yield r - f.close() - - return reader From 90c909ac7c0ba7155151b3af6aea655e0cd8ce98 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Sat, 24 Jun 2017 18:51:03 +0800 Subject: [PATCH 58/69] restore creator_test.py --- python/paddle/v2/reader/tests/creator_test.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py index dd84fbb002..9f8d7133b8 100644 --- a/python/paddle/v2/reader/tests/creator_test.py +++ b/python/paddle/v2/reader/tests/creator_test.py @@ -36,14 +36,5 @@ class TestTextFile(unittest.TestCase): self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1)) -class TestRecordIO(unittest.TestCase): - def test_RecordIO(self): - path = os.path.join( - os.path.dirname(__file__), "test_recordio_creator.dat") - reader = paddle.v2.reader.creator.RecordIO(path) - for idx, r in enumerate(reader()): - self.assertSequenceEqual(r, str(idx)) - - if __name__ == '__main__': unittest.main() From ae79b9ac1ccdf99713241c2e2b9f5c6bddcc0193 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Sat, 24 Jun 2017 18:52:09 +0800 Subject: [PATCH 59/69] restore --- .../v2/reader/tests/test_recordio_creator.dat | Bin 88 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 python/paddle/v2/reader/tests/test_recordio_creator.dat diff --git a/python/paddle/v2/reader/tests/test_recordio_creator.dat b/python/paddle/v2/reader/tests/test_recordio_creator.dat deleted file mode 100644 index 17aa89b6796184407e83246d3f342a55a66b4a69..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 88 zcmZQ!W@2QOHw Date: Sat, 24 Jun 2017 19:53:28 +0800 Subject: [PATCH 60/69] set ps_desired when pserver init --- go/cmd/pserver/pserver.go | 3 ++- go/pserver/service.go | 28 +++++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index fe1fe5f6f0..6c85b1804b 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -18,6 +18,7 @@ func main() { etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379", "comma separated endpoint string for pserver to connect to etcd") etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls") + numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job") logLevel := flag.String("log-level", "info", "log level, possible values: debug, info, warning, error, fatal, panic") flag.Parse() @@ -29,7 +30,7 @@ func main() { log.SetLevel(level) timeout := time.Second * time.Duration((*etcdTimeout)) - s, err := pserver.NewService(*etcdEndpoint, timeout) + s, err := pserver.NewService(*etcdEndpoint, *numPservers, timeout) if err != nil { panic(err) } diff --git a/go/pserver/service.go b/go/pserver/service.go index 7e2b841dd8..f966595fdc 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -73,7 +73,7 @@ type Service struct { // NewService creates a new service, will bypass etcd registration if no // endpoints specified. 
-func NewService(endpoints string, timeout time.Duration) (*Service, error) {
+func NewService(endpoints string, numPservers int, timeout time.Duration) (*Service, error) {
 	s := &Service{opt: newOptimizer(sgd, 0.005)}
 	s.paramMap = make(map[string]Parameter)
 	s.initialized = make(chan struct{})
@@ -103,6 +103,22 @@ func NewService(endpoints string, timeout time.Duration) (*Service, error) {
 			log.Debugf("inited client to %s", s.etcdEndpoints)
 			break
 		}
+		// init /ps_desired using a transaction, because multiple
+		// pservers may want to write it at the same time.
+		for {
+			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+			_, err := s.initDesiredPservers(ctx, numPservers)
+			cancel()
+			if err != nil {
+				log.Warn(err)
+				time.Sleep(s.etcdTimeout)
+				continue
+			}
+			break
+		}
+		// TODO: when implementing extending or reducing pservers, /ps_desired is
+		// changed, then we need to watch /ps_desired node for events. For now, just
+		// write once when init and read from it.
 
 		// wait and set s.desired init value
 		for {
 			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
@@ -141,6 +157,16 @@ func NewService(endpoints string, timeout time.Duration) (*Service, error) {
 	return s, nil
 }
 
+func (s *Service) initDesiredPservers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
+	return concurrency.NewSTM(s.etcdClient, func(c concurrency.STM) error {
+		dsStr := c.Get(PsDesired)
+		if dsStr == "" {
+			c.Put(PsDesired, strconv.Itoa(numPservers))
+		}
+		return nil
+	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
+}
+
 // registerPserverEtcd registers pserver node on etcd using transaction.
 func (s *Service) registerPserverEtcd(ctx context.Context) (*clientv3.TxnResponse, error) {
 	return concurrency.NewSTM(s.etcdClient, func(c concurrency.STM) error {
From ab2550c6400bce5d2596f5bff8629ef67ed195b8 Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Sun, 25 Jun 2017 15:44:55 -0700
Subject: [PATCH 61/69] Update design

---
 paddle/memory/README.md | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/paddle/memory/README.md b/paddle/memory/README.md
index fd32d07ef4..e5f7880e4c 100644
--- a/paddle/memory/README.md
+++ b/paddle/memory/README.md
@@ -31,7 +31,7 @@ In `paddle/memory/memory.h` we have:
 namespace memory {
 template <typename Place> void* Alloc(Place, size_t);
 template <typename Place> void Free(Place, void*);
-template <typename Place> void Used(Place);
+template <typename Place> size_t Used(Place);
 }  // namespace memory
 ```
 
@@ -39,7 +39,7 @@ These function templates have specializations on either `platform::CPUPlace` or
 
 ```cpp
 template<>
-void Alloc<CPUPlace>(CPUPlace p, size_t size) {
+void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
   return GetCPUBuddyAllocator()->Alloc(size);
 }
 ```
@@ -102,15 +102,11 @@ class BuddyAllocator {
 };
 ```
 
-#### System Allocators
-
-The `GPUAllocator` and `CPUAllocator` are calls *system allocators*. They work as the fallback allocators of `BuddyAllocator`. A system allocator holds information about a device, including the amount of memory has been allocated, so we can call
+Because `BuddyAllocator` has the meta-data of each block, it can trace the used memory -- record the amount returned by `Alloc` and freed in `Free`. In contrast, `CPUAllocator` and `GPUAllocator` don't know the size of a freed memory block and cannot do the trace.
 
-- `GPUAllocator::Used()` and
-- `CPUAllocator::Used()`
-
-to get the amount of memory that has been allocated so far.
+#### System Allocators
 
+The `GPUAllocator` and `CPUAllocator` are called *system allocators*. 
They work as the fallback allocators of `BuddyAllocator`. ## Justification From f403096aa4e03475f2201f6c444ce86f2e13a1a8 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Sun, 25 Jun 2017 17:59:23 -0700 Subject: [PATCH 62/69] Move directory third_party into /build --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5d7f2c7ec..3c719d35ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,7 +71,7 @@ if(ANDROID) "Disable RDMA when cross-compiling for Android" FORCE) endif(ANDROID) -set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING +set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") if (WITH_C_API AND WITH_PYTHON) From 9dd211f6c69066b93d7e81dd30e98bb12091a014 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 26 Jun 2017 11:38:21 +0800 Subject: [PATCH 63/69] Add Third Party Path back to TravisCI cache. --- .travis.yml | 5 ++++- paddle/scripts/travis/build_doc.sh | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 2c46da71e7..a57f1cd84b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ cache: directories: - $HOME/.ccache - $HOME/.cache/pip + - $HOME/third_party sudo: required dist: trusty os: @@ -41,7 +42,9 @@ before_install: - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: - - paddle/scripts/travis/$JOB.sh + - | + timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout + RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; notifications: email: on_success: change diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index 88264d8c26..193c291d43 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -6,12 +6,13 @@ mkdir -p $TRAVIS_BUILD_DIR/build cd $TRAVIS_BUILD_DIR/build # Compile Documentation only. -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF +cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF -DTHIRD_PARTY_PATH=$HOME/third_party + mkdir output make -j `nproc` find .. -name '*whl' | xargs pip install # install all wheels. rm -rf * -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON +cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON -DTHIRD_PARTY_PATH=$HOME/third_party make -j `nproc` paddle_docs paddle_docs_cn # check websites for broken links From be54d38a1f2e1bcf8a6fb40576a4712fbf05ca77 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 26 Jun 2017 11:42:03 +0800 Subject: [PATCH 64/69] Cache Paddle Default ThirdParty Dir --- .travis.yml | 2 +- paddle/scripts/travis/build_doc.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index a57f1cd84b..64961adcf2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,7 @@ cache: directories: - $HOME/.ccache - $HOME/.cache/pip - - $HOME/third_party + - $TRAVIS_BUILD_DIR/build/third_party sudo: required dist: trusty os: diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index 193c291d43..a44bd35357 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -6,13 +6,13 @@ mkdir -p $TRAVIS_BUILD_DIR/build cd $TRAVIS_BUILD_DIR/build # Compile Documentation only. -cmake .. 
-DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF -DTHIRD_PARTY_PATH=$HOME/third_party
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF
 
 mkdir output
 make -j `nproc`
 find .. -name '*whl' | xargs pip install # install all wheels.
 rm -rf *
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON -DTHIRD_PARTY_PATH=$HOME/third_party
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON
 make -j `nproc` paddle_docs paddle_docs_cn
 
 # check websites for broken links
From d76d2febbfd55243f471ea3521337d81e10f5971 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Mon, 26 Jun 2017 11:52:50 +0800
Subject: [PATCH 65/69] Adding platform/must_check.h

__must_check is a macro that marks a function's return value. It makes
the developer check whether the returned value is legal.

---
 paddle/platform/CMakeLists.txt                  |  1 +
 .../{utils/Compiler.h => platform/must_check.h} | 17 +++++------------
 paddle/platform/must_check_test.cc              | 10 ++++++++++
 paddle/utils/Error.h                            |  2 +-
 4 files changed, 17 insertions(+), 13 deletions(-)
 rename paddle/{utils/Compiler.h => platform/must_check.h} (78%)
 create mode 100644 paddle/platform/must_check_test.cc

diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index c7d7b14518..7abe2ab89e 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -2,3 +2,4 @@ nv_test(cuda_test SRCS cuda_test.cu)
 
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
+cc_test(must_check_test SRCS must_check_test.cc)
diff --git a/paddle/utils/Compiler.h b/paddle/platform/must_check.h
similarity index 78%
rename from paddle/utils/Compiler.h
rename to paddle/platform/must_check.h
index cebca5a2a3..4fcc62afc0 100644
--- a/paddle/utils/Compiler.h
+++ b/paddle/platform/must_check.h
@@ -10,24 +10,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
-/**
- * This header defines some useful attribute by each compiler. It is the
- * abstract layer of compilers.
- */
-#ifdef __GNUC__
-#define GCC_VERSION \
  (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-#else
-#define GCC_VERSION
-#endif
-
 /**
  * __must_check macro. It make the function's return value must be used,
  * otherwise it will raise a compile warning. And also Paddle treat all compile
  * warnings as errors.
  */
-#if GCC_VERSION >= 30400
+#ifdef __GNUC__
+#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 30400
 #define __must_check __attribute__((warn_unused_result))
 #else
 #define __must_check
 #endif
+#else
+#define __must_check
+#endif
diff --git a/paddle/platform/must_check_test.cc b/paddle/platform/must_check_test.cc
new file mode 100644
index 0000000000..6ee3ea49ac
--- /dev/null
+++ b/paddle/platform/must_check_test.cc
@@ -0,0 +1,10 @@
+#include
+#include
+
+int __must_check SomeFunctionMustCheck() { return 0; }
+
+TEST(MustCheck, all) {
+  // This line should not be compiled, because the
+  // return value of SomeFunctionMustCheck is marked as __must_check
+  // SomeFunctionMustCheck();
+}
\ No newline at end of file
diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
index cda1b5c37d..f3d535c69c 100644
--- a/paddle/utils/Error.h
+++ b/paddle/utils/Error.h
@@ -19,7 +19,7 @@ limitations under the License. 
*/ #include #include #include -#include "Compiler.h" +#include "paddle/platform/must_check.h" namespace paddle { From 97270b9f270fc7711f08b3ad80a4e17612d4606c Mon Sep 17 00:00:00 2001 From: root Date: Mon, 26 Jun 2017 19:46:20 +0800 Subject: [PATCH 66/69] add convert function --- python/paddle/v2/dataset/cifar.py | 29 +++++++++++++++----- python/paddle/v2/dataset/common.py | 5 +++- python/paddle/v2/dataset/conll05.py | 36 +++++++++++++++++-------- python/paddle/v2/dataset/imdb.py | 11 ++++++++ python/paddle/v2/dataset/imikolov.py | 14 ++++++++-- python/paddle/v2/dataset/mnist.py | 8 ++++++ python/paddle/v2/dataset/movielens.py | 14 +++++++--- python/paddle/v2/dataset/sentiment.py | 20 ++++++++++---- python/paddle/v2/dataset/uci_housing.py | 18 +++++++++---- python/paddle/v2/dataset/wmt14.py | 28 +++++++++++++------ 10 files changed, 141 insertions(+), 42 deletions(-) diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index 81af0a8e66..95984d980d 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -31,7 +31,7 @@ images per class. import cPickle import itertools import numpy -from common import download +import paddle.v2.dataset.common import tarfile __all__ = ['train100', 'test100', 'train10', 'test10'] @@ -75,7 +75,8 @@ def train100(): :rtype: callable """ return reader_creator( - download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'train') + paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), + 'train') def test100(): @@ -88,7 +89,9 @@ def test100(): :return: Test reader creator. :rtype: callable """ - return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test') + return reader_creator( + paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), + 'test') def train10(): @@ -102,7 +105,8 @@ def train10(): :rtype: callable """ return reader_creator( - download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch') + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + 'data_batch') def test10(): @@ -116,9 +120,20 @@ def test10(): :rtype: callable """ return reader_creator( - download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'test_batch') + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + 'test_batch') def fetch(): - download(CIFAR10_URL, 'cifar', CIFAR10_MD5) - download(CIFAR100_URL, 'cifar', CIFAR100_MD5) + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5) + paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train100(), 10, "cifar_train100") + paddle.v2.dataset.common.convert(path, test100(), 10, "cifar_test100") + paddle.v2.dataset.common.convert(path, train10(), 10, "cifar_train10") + paddle.v2.dataset.common.convert(path, test10(), 10, "cifar_test10") diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index 72894c24b1..4a2eb59c34 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -23,7 +23,10 @@ import paddle.v2.dataset import cPickle import glob -__all__ = ['DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader'] +__all__ = [ + 'DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader', + 'convert' +] DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset') diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index 12d648bf65..d4c2276b1b 100644 --- 
a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/v2/dataset/conll05.py
@@ -23,7 +23,7 @@ to initialize SRL model.
 import tarfile
 import gzip
 import itertools
-from common import download
+import paddle.v2.dataset.common
 
 __all__ = ['test, get_dict', 'get_embedding']
 
@@ -182,9 +182,15 @@ def get_dict():
     """
     Get the word, verb and label dictionary of Wikipedia corpus.
     """
-    word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
-    verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
-    label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
+    word_dict = load_dict(
+        paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st',
+                                          WORDDICT_MD5))
+    verb_dict = load_dict(
+        paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
+                                          VERBDICT_MD5))
+    label_dict = load_dict(
+        paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
+                                          TRGDICT_MD5))
 
     return word_dict, verb_dict, label_dict
 
@@ -192,7 +198,7 @@ def get_embedding():
     """
     Get the trained word vector based on Wikipedia corpus.
     """
-    return download(EMB_URL, 'conll05st', EMB_MD5)
+    return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
 
 
 def test():
@@ -209,15 +215,23 @@ def test():
     """
     word_dict, verb_dict, label_dict = get_dict()
     reader = corpus_reader(
-        download(DATA_URL, 'conll05st', DATA_MD5),
+        paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
         words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
         props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
     return reader_creator(reader, word_dict, verb_dict, label_dict)
 
 
 def fetch():
-    download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
-    download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
-    download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
-    download(EMB_URL, 'conll05st', EMB_MD5)
-    download(DATA_URL, 'conll05st', DATA_MD5)
+    paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
+    paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
+    paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
+    paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
+    paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
+
+
+def convert():
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, test(), 10, "conll05_train")
+    paddle.v2.dataset.common.convert(path, test(), 10, "conll05_test")
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 5dc5abfe53..d939bc3065 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -166,3 +166,14 @@ def word_dict():
 
 def fetch():
     paddle.v2.dataset.common.download(URL, 'imdb', MD5)
+
+
+def convert():
+    """
+    Converts dataset to recordio format
+    """
+    word_dict = ds.imdb.word_dict()
+    paddle.v2.dataset.common.convert(path, lambda: train(word_dict), 10,
+                                     "imdb_train")
+    paddle.v2.dataset.common.convert(path, lambda: test(word_dict), 10,
+                                     "imdb_test")
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
index dd3a4552d2..034f58c2c8 100644
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -18,7 +18,7 @@ This module will download dataset from
 http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
 into paddle reader creators. 
""" -import paddle.v2.dataset.common +import paddle.v2.dataset.common as common import collections import tarfile @@ -145,4 +145,14 @@ def test(word_idx, n, data_type=DataType.NGRAM): def fetch(): - paddle.v2.dataset.common.download(URL, "imikolov", MD5) + common.download(URL, "imikolov", MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + N = 5 + word_dict = build_dict() + common.convert(path, train(word_dict, N), 10, "imikolov_train") + common.convert(path, test(word_dict, N), 10, "imikolov_test") diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index 435556b292..92d7f69b8d 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -113,3 +113,11 @@ def fetch(): paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 10, "minist_train") + paddle.v2.dataset.common.convert(path, test(), 10, "minist_test") diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index 837a859126..fb906cd4b6 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -23,7 +23,7 @@ set and test set into paddle reader creators. """ import zipfile -from common import download +import paddle.v2.dataset.common import re import random import functools @@ -99,7 +99,7 @@ USER_INFO = None def __initialize_meta_info__(): - fn = download(URL, "movielens", MD5) + fn = paddle.v2.dataset.common.download(URL, "movielens", MD5) global MOVIE_INFO if MOVIE_INFO is None: pattern = re.compile(r'^(.*)\((\d+)\)$') @@ -246,7 +246,15 @@ def unittest(): def fetch(): - download(URL, "movielens", MD5) + paddle.v2.dataset.common.download(URL, "movielens", MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 10, "movielens_train") + paddle.v2.dataset.common.convert(path, test(), 10, "movielens_test") if __name__ == '__main__': diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py index 4dd34e7383..89683c2063 100644 --- a/python/paddle/v2/dataset/sentiment.py +++ b/python/paddle/v2/dataset/sentiment.py @@ -26,7 +26,7 @@ from itertools import chain import nltk from nltk.corpus import movie_reviews -import common +import paddle.v2.dataset.common __all__ = ['train', 'test', 'get_word_dict'] NUM_TRAINING_INSTANCES = 1600 @@ -39,12 +39,13 @@ def download_data_if_not_yet(): """ try: # make sure that nltk can find the data - if common.DATA_HOME not in nltk.data.path: - nltk.data.path.append(common.DATA_HOME) + if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path: + nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME) movie_reviews.categories() except LookupError: print "Downloading movie_reviews data set, please wait....." - nltk.download('movie_reviews', download_dir=common.DATA_HOME) + nltk.download( + 'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) print "Download data set success....." 
print "Path is " + nltk.data.find('corpora/movie_reviews').path @@ -128,4 +129,13 @@ def test(): def fetch(): - nltk.download('movie_reviews', download_dir=common.DATA_HOME) + nltk.download( + 'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train, 10, "sentiment_train") + paddle.v2.dataset.common.convert(path, test, 10, "sentiment_test") diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index 3469fd9ce1..9e15000c02 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py @@ -14,14 +14,14 @@ """ UCI Housing dataset. -This module will download dataset from +This module will paddle.v2.dataset.common.download dataset from https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and parse training set and test set into paddle reader creators. """ import numpy as np import os -from common import download +import paddle.v2.dataset.common __all__ = ['train', 'test'] @@ -82,7 +82,7 @@ def train(): :rtype: callable """ global UCI_TRAIN_DATA - load_data(download(URL, 'uci_housing', MD5)) + load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) def reader(): for d in UCI_TRAIN_DATA: @@ -102,7 +102,7 @@ def test(): :rtype: callable """ global UCI_TEST_DATA - load_data(download(URL, 'uci_housing', MD5)) + load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) def reader(): for d in UCI_TEST_DATA: @@ -112,4 +112,12 @@ def test(): def fetch(): - download(URL, 'uci_housing', MD5) + paddle.v2.dataset.common.download(URL, 'uci_housing', MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 10, "uci_housing_train") + paddle.v2.dataset.common.convert(path, test(), 10, "uci_houseing_test") diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index 0902f87741..f29c9275f0 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -22,7 +22,7 @@ parse training set and test set into paddle reader creators. 
import tarfile import gzip -from paddle.v2.dataset.common import download +import paddle.v2.dataset.common from paddle.v2.parameters import Parameters __all__ = ['train', 'test', 'build_dict'] @@ -115,7 +115,8 @@ def train(dict_size): :rtype: callable """ return reader_creator( - download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size) + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'train/train', dict_size) def test(dict_size): @@ -130,16 +131,18 @@ def test(dict_size): :rtype: callable """ return reader_creator( - download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size) + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'test/test', dict_size) def gen(dict_size): return reader_creator( - download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'gen/gen', dict_size) + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'gen/gen', dict_size) def model(): - tar_file = download(URL_MODEL, 'wmt14', MD5_MODEL) + tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) with gzip.open(tar_file, 'r') as f: parameters = Parameters.from_tar(f) return parameters @@ -148,7 +151,7 @@ def model(): def get_dict(dict_size, reverse=True): # if reverse = False, return dict = {'a':'001', 'b':'002', ...} # else reverse = true, return dict = {'001':'a', '002':'b', ...} - tar_file = download(URL_TRAIN, 'wmt14', MD5_TRAIN) + tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) src_dict, trg_dict = __read_to_dict__(tar_file, dict_size) if reverse: src_dict = {v: k for k, v in src_dict.items()} @@ -157,5 +160,14 @@ def get_dict(dict_size, reverse=True): def fetch(): - download(URL_TRAIN, 'wmt14', MD5_TRAIN) - download(URL_MODEL, 'wmt14', MD5_MODEL) + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) + paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) + + +def convert(path): + """ + Converts dataset to recordio format + """ + dict_size = 30000 + paddle.v2.dataset.common.convert(path, train(dict_size), 10, "wmt14_train") + paddle.v2.dataset.common.convert(path, test(dict_size), 10, "wmt14_test") From e915aa9cf1784a82dce2b8cd0b77486c1219f6c3 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 26 Jun 2017 20:27:07 +0800 Subject: [PATCH 67/69] fix bugs --- python/paddle/v2/dataset/cifar.py | 2 +- python/paddle/v2/dataset/conll05.py | 4 ++-- python/paddle/v2/dataset/imdb.py | 12 +++++------- python/paddle/v2/dataset/imikolov.py | 12 +++++++----- python/paddle/v2/dataset/mnist.py | 2 +- python/paddle/v2/dataset/movielens.py | 3 ++- python/paddle/v2/dataset/sentiment.py | 2 +- python/paddle/v2/dataset/uci_housing.py | 2 +- python/paddle/v2/dataset/wmt14.py | 2 +- 9 files changed, 21 insertions(+), 20 deletions(-) diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index 95984d980d..f885b2834e 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -34,7 +34,7 @@ import numpy import paddle.v2.dataset.common import tarfile -__all__ = ['train100', 'test100', 'train10', 'test10'] +__all__ = ['train100', 'test100', 'train10', 'test10', 'convert'] URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/' CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz' diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index d4c2276b1b..f8aae52e7c 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -25,7 +25,7 @@ import gzip import itertools import paddle.v2.dataset.common 
-__all__ = ['test, get_dict', 'get_embedding']
+__all__ = ['test', 'get_dict', 'get_embedding', 'convert']
 
 DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
 DATA_MD5 = '387719152ae52d60422c016e92a742fc'
@@ -229,7 +229,7 @@ def fetch():
     paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
 
 
-def convert():
+def convert(path):
     """
     Converts dataset to recordio format
     """
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index d939bc3065..c0ec5992e0 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -28,7 +28,7 @@ import re
 import string
 import threading
 
-__all__ = ['build_dict', 'train', 'test']
+__all__ = ['build_dict', 'train', 'test', 'convert']
 
 URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
 MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
@@ -168,12 +168,10 @@ def fetch():
     paddle.v2.dataset.common.download(URL, 'imdb', MD5)
 
 
-def convert():
+def convert(path):
     """
     Converts dataset to recordio format
     """
-    word_dict = ds.imdb.word_dict()
-    paddle.v2.dataset.common.convert(path, lambda: train(word_dict), 10,
-                                     "imdb_train")
-    paddle.v2.dataset.common.convert(path, lambda: test(word_dict), 10,
-                                     "imdb_test")
+    w = word_dict()
+    paddle.v2.dataset.common.convert(path, lambda: train(w), 10, "imdb_train")
+    paddle.v2.dataset.common.convert(path, lambda: test(w), 10, "imdb_test")
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
index 034f58c2c8..b18ee8e9ba 100644
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -18,11 +18,11 @@ This module will download dataset from
 http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
 into paddle reader creators. 
""" -import paddle.v2.dataset.common as common +import paddle.v2.dataset.common import collections import tarfile -__all__ = ['train', 'test', 'build_dict'] +__all__ = ['train', 'test', 'build_dict', 'convert'] URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz' MD5 = '30177ea32e27c525793142b6bf2c8e2d' @@ -145,7 +145,7 @@ def test(word_idx, n, data_type=DataType.NGRAM): def fetch(): - common.download(URL, "imikolov", MD5) + paddle.v2.dataset.common.download(URL, "imikolov", MD5) def convert(path): @@ -154,5 +154,7 @@ def convert(path): """ N = 5 word_dict = build_dict() - common.convert(path, train(word_dict, N), 10, "imikolov_train") - common.convert(path, test(word_dict, N), 10, "imikolov_test") + paddle.v2.dataset.common.convert(path, + train(word_dict, N), 10, "imikolov_train") + paddle.v2.dataset.common.convert(path, + test(word_dict, N), 10, "imikolov_test") diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index 92d7f69b8d..ea5891f4f3 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -21,7 +21,7 @@ import paddle.v2.dataset.common import subprocess import numpy import platform -__all__ = ['train', 'test'] +__all__ = ['train', 'test', 'convert'] URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/' TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz' diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index fb906cd4b6..d9372d422a 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -30,7 +30,8 @@ import functools __all__ = [ 'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id', - 'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info' + 'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info', + 'convert' ] age_table = [1, 18, 25, 35, 45, 50, 56] diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py index 89683c2063..e33f120c87 100644 --- a/python/paddle/v2/dataset/sentiment.py +++ b/python/paddle/v2/dataset/sentiment.py @@ -28,7 +28,7 @@ from nltk.corpus import movie_reviews import paddle.v2.dataset.common -__all__ = ['train', 'test', 'get_word_dict'] +__all__ = ['train', 'test', 'get_word_dict', 'convert'] NUM_TRAINING_INSTANCES = 1600 NUM_TOTAL_INSTANCES = 2000 diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index 9e15000c02..c715ea9681 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py @@ -29,7 +29,7 @@ URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing MD5 = 'd4accdce7a25600298819f8e28e8d593' feature_names = [ 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', - 'PTRATIO', 'B', 'LSTAT' + 'PTRATIO', 'B', 'LSTAT', 'convert' ] UCI_TRAIN_DATA = None diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index f29c9275f0..e1dc4f4c30 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -25,7 +25,7 @@ import gzip import paddle.v2.dataset.common from paddle.v2.parameters import Parameters -__all__ = ['train', 'test', 'build_dict'] +__all__ = ['train', 'test', 'build_dict', 'convert'] URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' From a243bdfbcf2e2ad718d2140b66964187b4deab9e Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 26 
Jun 2017 20:38:18 +0800 Subject: [PATCH 68/69] rm not need --- python/paddle/v2/reader/creator.py | 21 +------------------ python/paddle/v2/reader/tests/creator_test.py | 11 ---------- 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py index 994062096f..07142056f8 100644 --- a/python/paddle/v2/reader/creator.py +++ b/python/paddle/v2/reader/creator.py @@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could be used in user program. """ -__all__ = ['np_array', 'text_file', "RecordIO"] +__all__ = ['np_array', 'text_file'] def np_array(x): @@ -55,22 +55,3 @@ def text_file(path): f.close() return reader - - -def RecordIO(path): - """ - Creates a data reader that outputs record one one by one from given recordio file - :path: path of recordio file - :returns: data reader of recordio file - """ - - def reader(): - f = recordio.reader(path) - while True: - r = f.read() - if r is None: - break - yield r - f.close() - - return reader diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py index dd84fbb002..359f3eeefb 100644 --- a/python/paddle/v2/reader/tests/creator_test.py +++ b/python/paddle/v2/reader/tests/creator_test.py @@ -13,9 +13,7 @@ # limitations under the License. import os import unittest - import numpy as np - import paddle.v2.reader.creator @@ -36,14 +34,5 @@ class TestTextFile(unittest.TestCase): self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1)) -class TestRecordIO(unittest.TestCase): - def test_RecordIO(self): - path = os.path.join( - os.path.dirname(__file__), "test_recordio_creator.dat") - reader = paddle.v2.reader.creator.RecordIO(path) - for idx, r in enumerate(reader()): - self.assertSequenceEqual(r, str(idx)) - - if __name__ == '__main__': unittest.main() From b9d015cbc4975f9513f106356e8f7848737cf0f9 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 26 Jun 2017 20:40:12 +0800 Subject: [PATCH 69/69] rm not need --- .../v2/reader/tests/test_recordio_creator.dat | Bin 88 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 python/paddle/v2/reader/tests/test_recordio_creator.dat diff --git a/python/paddle/v2/reader/tests/test_recordio_creator.dat b/python/paddle/v2/reader/tests/test_recordio_creator.dat deleted file mode 100644 index 17aa89b6796184407e83246d3f342a55a66b4a69..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 88 zcmZQ!W@2QOHw