diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 8b71f73c36..b36c1d1339 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -24,6 +24,8 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init) +cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) + cc_test(variable_test SRCS variable_test.cc) cc_library(threadpool SRCS threadpool.cc DEPS enforce) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 9a232b0843..2a88e5a929 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" +#include "paddle/framework/reader.h" #include "paddle/platform/place.h" #include "paddle/platform/profiler.h" @@ -52,11 +53,13 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { var->GetMutable(); } else if (var_type == proto::VarDesc::PLACE_LIST) { var->GetMutable(); + } else if (var_type == proto::VarDesc::READER) { + var->GetMutable(); } else { PADDLE_THROW( "Variable type %d is not in " - "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE," - " PLACE_LIST]", + "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " + "LOD_RANK_TABLE, PLACE_LIST, READER]", var_type); } } diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto index f65ccae6e6..d7be1a7352 100644 --- a/paddle/framework/framework.proto +++ b/paddle/framework/framework.proto @@ -116,7 +116,7 @@ message LoDTensorArrayDesc { optional int32 lod_level = 2 [ default = 0 ]; } -message Reader { repeated LoDTensorDesc lod_tensor = 1; } +message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } message VarDesc { enum VarType { @@ -136,7 +136,7 @@ message VarDesc { optional LoDTensorDesc lod_tensor = 4; optional TensorDesc selected_rows = 5; optional LoDTensorArrayDesc tensor_array = 6; - optional Reader reader = 7; + optional ReaderDesc reader = 7; } message BlockDesc { diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index ad361852ec..ea40287502 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -72,6 +72,11 @@ class CompileTimeInferShapeContext : public InferShapeContext { void SetDim(const std::string &name, const DDim &dim) override; + std::vector GetRepeatedDims(const std::string &name) const override; + + void SetRepeatedDims(const std::string &name, + const std::vector &dims) override; + const OpDesc &op_; const BlockDesc &block_; }; @@ -457,23 +462,48 @@ const std::vector &CompileTimeInferShapeContext::Outputs( DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const { auto var = block_.FindVarRecursive(name); PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + DDim res; try { auto shape = var->GetShape(); - if (shape.empty()) { - return framework::make_ddim({0UL}); - } else { - return framework::make_ddim(var->GetShape()); - } + res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape); } catch (...) { VLOG(5) << "GetDim of variable " << name << " error"; std::rethrow_exception(std::current_exception()); } + return res; +} + +std::vector CompileTimeInferShapeContext::GetRepeatedDims( + const std::string &name) const { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + std::vector res; + try { + auto shapes = var->GetShapes(); + for (const auto &s : shapes) { + res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s)); + } + } catch (...) { + VLOG(5) << "GetRepeatedDim of variable " << name << " error."; + std::rethrow_exception(std::current_exception()); + } + return res; } void CompileTimeInferShapeContext::SetDim(const std::string &name, const DDim &dim) { - block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim)); + block_.FindVarRecursive(name)->SetShape(vectorize(dim)); } + +void CompileTimeInferShapeContext::SetRepeatedDims( + const std::string &name, const std::vector &dims) { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + std::vector> dim_vec(dims.size()); + std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize); + var->SetShapes(dim_vec); +} + bool CompileTimeInferShapeContext::IsRuntime() const { return false; } proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType( diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 81fa8cf477..52387aabd9 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -320,8 +320,8 @@ class RuntimeInferShapeContext : public InferShapeContext { if (length == 0) { return false; } - PADDLE_ENFORCE_EQ(length, 1UL, "Input %s should have more than one inputs", - name); + PADDLE_ENFORCE_EQ(length, 1UL, + "Input %s should not have more than one inputs", name); auto ipt = ins[0]; auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); return var != nullptr; @@ -333,8 +333,8 @@ class RuntimeInferShapeContext : public InferShapeContext { if (length == 0) { return false; } - PADDLE_ENFORCE_EQ(length, 1UL, "Output %s should have more than one inputs", - name); + PADDLE_ENFORCE_EQ(length, 1UL, + "Output %s should not have more than one inputs", name); auto ipt = outs[0]; auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); return var != nullptr; @@ -421,8 +421,22 @@ class RuntimeInferShapeContext : public InferShapeContext { } else if (var->IsType()) { return var->Get().GetCompleteDims(); } else { - PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", - name, var->Type().name()); + PADDLE_THROW( + "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's " + "type_id is %s.", + name, var->Type().name()); + } + } + + std::vector GetRepeatedDims(const std::string& name) const override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + return var->Get().shapes(); + } else { + PADDLE_THROW( + "Only ReaderHolder support 'GetRepeatedDims', but Variable %s's " + "type_id is %s.", + name, var->Type().name()); } } @@ -438,6 +452,19 @@ class RuntimeInferShapeContext : public InferShapeContext { } } + void SetRepeatedDims(const std::string& name, + const std::vector& dims) override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + var->GetMutable()->set_shapes(dims); + } else { + PADDLE_THROW( + "Only ReaderHolder support 'SetRepeatedDims', but Variable %s's " + "type_id is %s.", + name, var->Type().name()); + } + } + proto::VarDesc::VarType GetVarType(const std::string& name) const override { auto* var = scope_.FindVar(name); return ToVarType(var->Type()); diff --git a/paddle/framework/reader.cc b/paddle/framework/reader.cc new file mode 100644 index 0000000000..928b661aaa --- /dev/null +++ b/paddle/framework/reader.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/framework/reader.h" + +namespace paddle { +namespace framework { + +DDim ReaderBase::shape(size_t idx) const { + PADDLE_ENFORCE_LT( + idx, shapes_.size(), + "Cannot get the %d'th shape, 'shapes_' only has %d elements.", idx, + shapes_.size()); + return shapes_[idx]; +} + +void ShuffleReader::ReadNext(std::vector* out) { + if (iteration_pos_ >= buffer_.size()) { + // Reload buffer with new data + buffer_.clear(); + buffer_.reserve(buffer_size_); + for (int i = 0; i < buffer_size_; ++i) { + if (reader_->HasNext()) { + buffer_.push_back(std::vector()); + reader_->ReadNext(&buffer_.back()); + } else { + break; + } + } + // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be + // optimize. + std::random_shuffle(buffer_.begin(), buffer_.end()); + iteration_pos_ = 0; + } + out->clear(); + if (!buffer_.empty()) { + std::swap(*out, buffer_[iteration_pos_++]); + } + // if buffer_ is empty, the 'out' will return as an empty vector. +} + +void BatchReader::ReadNext(std::vector* out) { + buffer_.clear(); + buffer_.reserve(batch_size_); + for (int i = 0; i < batch_size_; ++i) { + if (reader_->HasNext()) { + buffer_.push_back(std::vector()); + reader_->ReadNext(&buffer_.back()); + } else { + break; + } + } + // Concat instances + out->clear(); + if (buffer_.empty()) { + // if buffer_ is empty, the 'out' will return as an empty vector. + return; + } + int out_num = buffer_[0].size(); + out->reserve(out_num); + for (int j = 0; j < out_num; ++j) { + // Merge shape and check date type + std::type_index batch_type = buffer_[0][j].type(); + DDim batch_shape = buffer_[0][j].dims(); + for (size_t i = 1; i < buffer_.size(); ++i) { + std::type_index ins_type = buffer_[i][j].type(); + DDim ins_shape = buffer_[i][j].dims(); + PADDLE_ENFORCE_EQ(batch_type, ins_type); + PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()), + slice_ddim(ins_shape, 1, ins_shape.size())); + PADDLE_ENFORCE_GT(ins_shape[0], 0); + batch_shape[0] += ins_shape[0]; + } + + LoDTensor out_tensor; + out_tensor.Resize(batch_shape); + out_tensor.mutable_data(platform::CPUPlace(), batch_type); + int64_t dst_offset = 0; + + // Merge lod and data + LoD batch_lod; + std::vector top_level_lod({0}); + for (size_t i = 0; i < buffer_.size(); ++i) { + DDim ins_shape = buffer_[i][j].dims(); + LoD ins_lod = buffer_[i][j].lod(); + if (i == 0) { + batch_lod = ins_lod; + } else { + PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size()); + for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) { + auto& lod_level = batch_lod[level_idx]; + for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) { + lod_level.push_back(ins_lod[level_idx][k] + lod_level.back()); + } + } + } + top_level_lod.push_back( + top_level_lod.back() + + (ins_lod.empty() ? ins_shape[0] : (ins_lod[0].size() - 1))); + + Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]); + Copy(buffer_[i][j], platform::CPUPlace(), &dst); + dst_offset += ins_shape[0]; + } + batch_lod.insert(batch_lod.begin(), top_level_lod); + out_tensor.set_lod(batch_lod); + out->push_back(out_tensor); + } +} +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/reader.h b/paddle/framework/reader.h new file mode 100644 index 0000000000..534894cfbd --- /dev/null +++ b/paddle/framework/reader.h @@ -0,0 +1,161 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/framework/ddim.h" +#include "paddle/framework/lod_tensor_array.h" + +namespace paddle { +namespace framework { + +class ReaderBase { + public: + explicit ReaderBase(const std::vector& shapes) : shapes_(shapes) { + PADDLE_ENFORCE(!shapes_.empty()); + } + virtual void ReadNext(std::vector* out) = 0; + virtual bool HasNext() const = 0; + + virtual void ReInit() = 0; + + DDim shape(size_t idx) const; + std::vector shapes() const { return shapes_; } + void set_shapes(const std::vector& shapes) { shapes_ = shapes; } + + virtual ~ReaderBase() {} + + protected: + std::vector shapes_; +}; + +class FileReader : public ReaderBase { + public: + explicit FileReader(const std::vector& shapes) : ReaderBase(shapes) {} +}; + +class DecoratedReader : public ReaderBase { + public: + explicit DecoratedReader(ReaderBase* reader) + : ReaderBase(reader->shapes()), reader_(reader) { + PADDLE_ENFORCE_NOT_NULL(reader_); + } + + bool HasNext() const override { return reader_->HasNext(); } + + void ReInit() override { reader_->ReInit(); } + + protected: + ReaderBase* reader_; +}; + +// file readers + +template +class RandomDataGenerator : public FileReader { + public: + RandomDataGenerator(const std::vector& shapes, float min, float max) + : FileReader(shapes), min_(min), max_(max) { + PADDLE_ENFORCE_LE( + min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max); + unsigned int seed = std::random_device()(); + engine_.seed(seed); + dist_ = std::uniform_real_distribution(min_, max_); + } + + void ReadNext(std::vector* out) override { + out->clear(); + out->reserve(shapes_.size()); + for (const DDim& shape : shapes_) { + PADDLE_ENFORCE_GE( + shape.size(), 2, + "The rank of reader's output data should be 2 at least.(Now it's %d)", + shape.size()); + LoDTensor out_tensor; + out_tensor.Resize(shape); + T* data = out_tensor.mutable_data(platform::CPUPlace()); + int64_t numel = product(shape); + for (int64_t i = 0; i < numel; ++i) { + data[i] = dist_(engine_); + } + out->push_back(out_tensor); + } + } + + bool HasNext() const override { return true; } + + void ReInit() override { return; } + + private: + float min_; + float max_; + std::minstd_rand engine_; + std::uniform_real_distribution dist_; +}; + +// decorated readers + +class ShuffleReader : public DecoratedReader { + public: + ShuffleReader(ReaderBase* reader, int buffer_size) + : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) { + buffer_.reserve(buffer_size); + } + + void ReadNext(std::vector* out) override; + + private: + int buffer_size_; + std::vector> buffer_; + size_t iteration_pos_; +}; + +class BatchReader : public DecoratedReader { + public: + BatchReader(ReaderBase* reader, int batch_size) + : DecoratedReader(reader), batch_size_(batch_size) { + buffer_.reserve(batch_size_); + } + + void ReadNext(std::vector* out) override; + + private: + int batch_size_; + std::vector> buffer_; +}; + +// The ReaderHolder is used as readers' unified wrapper, +// making it easier to access different type readers in Variables. +class ReaderHolder { + public: + void Reset(ReaderBase* reader) { reader_.reset(reader); } + + ReaderBase* Get() const { return reader_.get(); } + + void ReadNext(std::vector* out) { reader_->ReadNext(out); } + bool HasNext() const { return reader_->HasNext(); } + void ReInit() { reader_->ReInit(); } + + DDim shape(size_t idx) const { return reader_->shape(idx); } + std::vector shapes() const { return reader_->shapes(); } + void set_shapes(const std::vector& shapes) { + reader_->set_shapes(shapes); + } + + private: + std::unique_ptr reader_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc index a0fa467291..2f4d450577 100644 --- a/paddle/framework/shape_inference.cc +++ b/paddle/framework/shape_inference.cc @@ -32,6 +32,16 @@ std::vector InferShapeContext::GetInputsDim( return GetDims(arg_names); } +std::vector InferShapeContext::GetReaderDims( + const std::string &name) const { + const std::vector &arg_names = Inputs(name); + PADDLE_ENFORCE_EQ( + arg_names.size(), 1UL, + "Reader input '%s' should hold one element, but now it holds %d", name, + arg_names.size()); + return this->GetRepeatedDims(arg_names[0]); +} + DDim InferShapeContext::GetInputsElementDim(const std::string &name, int idx) const { const std::vector &names = Inputs(name); @@ -52,6 +62,16 @@ void InferShapeContext::SetOutputsDim(const std::string &name, SetDims(names, dims); } +void InferShapeContext::SetReaderDims(const std::string &name, + const std::vector &dims) { + const std::vector &arg_names = Outputs(name); + PADDLE_ENFORCE_EQ( + arg_names.size(), 1UL, + "Reader output '%s' should hold one element, but now it holds %d", name, + arg_names.size()); + return this->SetRepeatedDims(arg_names[0], dims); +} + std::vector InferShapeContext::GetDims( const std::vector &names) const { std::vector ret; @@ -61,6 +81,7 @@ std::vector InferShapeContext::GetDims( [this](const std::string &name) { return this->GetDim(name); }); return ret; } + void InferShapeContext::SetDims(const std::vector &names, const std::vector &dims) { size_t length = names.size(); @@ -72,14 +93,17 @@ void InferShapeContext::SetDims(const std::vector &names, SetDim(names[i], dims[i]); } } + std::vector InferShapeContext::GetInputsVarType( const std::string &name) const { return GetVarTypes(Inputs(name)); } + std::vector InferShapeContext::GetOutputsVarType( const std::string &name) const { return GetVarTypes(Outputs(name)); } + std::vector InferShapeContext::GetVarTypes( const std::vector &names) const { std::vector retv; diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h index 830f199ed1..7bee869852 100644 --- a/paddle/framework/shape_inference.h +++ b/paddle/framework/shape_inference.h @@ -36,12 +36,13 @@ class InferShapeContext { virtual bool HasOutputs(const std::string &name) const = 0; DDim GetInputDim(const std::string &name) const; - std::vector GetInputsDim(const std::string &name) const; + std::vector GetReaderDims(const std::string &name) const; DDim GetInputsElementDim(const std::string &name, int idx) const; void SetOutputDim(const std::string &name, const DDim &dim); void SetOutputsDim(const std::string &name, const std::vector &dims); + void SetReaderDims(const std::string &name, const std::vector &dims); virtual AttrReader Attrs() const = 0; virtual const std::vector &Inputs( @@ -61,6 +62,9 @@ class InferShapeContext { protected: virtual DDim GetDim(const std::string &name) const = 0; virtual void SetDim(const std::string &name, const DDim &dim) = 0; + virtual std::vector GetRepeatedDims(const std::string &name) const = 0; + virtual void SetRepeatedDims(const std::string &name, + const std::vector &dims) = 0; std::vector GetDims(const std::vector &names) const; std::vector GetVarTypes( diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc index 6d83e2e411..11a4daf2c9 100644 --- a/paddle/framework/var_desc.cc +++ b/paddle/framework/var_desc.cc @@ -57,10 +57,13 @@ size_t VarDesc::GetTensorDescNum() const { void VarDesc::SetShapes( const std::vector> &multiple_dims) { - PADDLE_ENFORCE_EQ(multiple_dims.size(), GetTensorDescNum(), - "The number of given shapes(%d) doesn't equal to the " - "number of sub tensor.", - multiple_dims.size(), GetTensorDescNum()); + if (multiple_dims.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_dims.size()); + } std::vector tensors = mutable_tensor_descs(); for (size_t i = 0; i < multiple_dims.size(); ++i) { VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims()); @@ -87,10 +90,14 @@ void VarDesc::SetDataType(proto::DataType data_type) { void VarDesc::SetDataTypes( const std::vector &multiple_data_type) { - PADDLE_ENFORCE_EQ(multiple_data_type.size(), GetTensorDescNum(), - "The number of given data types(%d) doesn't equal to the " - "number of sub tensor.", - multiple_data_type.size(), GetTensorDescNum()); + if (multiple_data_type.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given data types(" + << multiple_data_type.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_data_type.size()); + } std::vector tensor_descs = mutable_tensor_descs(); for (size_t i = 0; i < multiple_data_type.size(); ++i) { tensor_descs[i]->set_data_type(multiple_data_type[i]); @@ -127,10 +134,14 @@ void VarDesc::SetLoDLevel(int32_t lod_level) { } void VarDesc::SetLoDLevels(const std::vector &multiple_lod_level) { - PADDLE_ENFORCE_EQ(multiple_lod_level.size(), GetTensorDescNum(), - "The number of given data types(%d) doesn't equal to the " - "number of sub tensor.", - multiple_lod_level.size(), GetTensorDescNum()); + if (multiple_lod_level.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given lod_levels(" + << multiple_lod_level.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_lod_level.size()); + } switch (desc_.type()) { case proto::VarDesc::READER: { size_t i = 0; diff --git a/paddle/framework/var_type.h b/paddle/framework/var_type.h index 5b7a08a087..599d451490 100644 --- a/paddle/framework/var_type.h +++ b/paddle/framework/var_type.h @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor_array.h" +#include "paddle/framework/reader.h" #include "paddle/framework/selected_rows.h" #include "paddle/framework/variable.h" @@ -31,6 +32,8 @@ inline proto::VarDesc::VarType ToVarType(std::type_index type) { return proto::VarDesc_VarType_LOD_TENSOR_ARRAY; } else if (type.hash_code() == typeid(SelectedRows).hash_code()) { return proto::VarDesc_VarType_SELECTED_ROWS; + } else if (type.hash_code() == typeid(ReaderHolder).hash_code()) { + return proto::VarDesc_VarType_READER; } else { PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); } @@ -40,7 +43,7 @@ template inline void VisitVarType(const framework::Variable& var, Visitor visitor) { switch (ToVarType(var.Type())) { case proto::VarDesc_VarType_LOD_TENSOR: - visitor(var.Get()); + visitor(var.Get()); return; case proto::VarDesc_VarType_LOD_RANK_TABLE: visitor(var.Get()); @@ -51,6 +54,9 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) { case proto::VarDesc_VarType_SELECTED_ROWS: visitor(var.Get()); return; + case proto::VarDesc_VarType_READER: + visitor(var.Get()); + return; default: PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); } diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 000c2089c1..25bb7187d3 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -62,7 +62,7 @@ function(op_library TARGET) endif() # Define operators that don't need pybind here. - foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op") + foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "create_reader_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() @@ -155,6 +155,7 @@ op_library(recurrent_op DEPS executor) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function) op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) +op_library(create_reader_op DEPS reader) # Regist multiple Kernel to pybind if (WITH_GPU) @@ -185,7 +186,7 @@ list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) op_library(${src}) endforeach() -file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") +file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(create_random_data_generator);\n") set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") diff --git a/paddle/operators/create_reader_op.cc b/paddle/operators/create_reader_op.cc new file mode 100644 index 0000000000..5ba2a25ab4 --- /dev/null +++ b/paddle/operators/create_reader_op.cc @@ -0,0 +1,205 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/framework/op_registry.h" +#include "paddle/framework/reader.h" + +namespace paddle { +namespace operators { + +static std::vector RestoreShapes( + const std::vector& shape_concat, const std::vector& ranks) { + std::vector res; + int offset = 0; + for (int len : ranks) { + auto start_it = shape_concat.begin() + offset; + auto end_it = start_it + len; + res.push_back(framework::make_ddim(std::vector(start_it, end_it))); + offset += len; + } + return res; +} + +// general infershape for file readers +class CreateFileReaderInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "The output file reader should not be null."); + const auto shape_concat = + ctx->Attrs().Get>("shape_concat"); + const auto ranks = ctx->Attrs().Get>("ranks"); + std::vector shapes = RestoreShapes(shape_concat, ranks); + ctx->SetReaderDims("Out", shapes); + } +}; + +// general infershape for decorated readers +class CreateDecoratedReaderInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"), + "Input(UnderlyingReader) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "The output decorated reader should not be null."); + ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader")); + } +}; + +// general var type inference for all readers +class CreateReaderInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + std::string reader_name = op_desc.Output("Out")[0]; + framework::VarDesc* reader = block->FindVarRecursive(reader_name); + reader->SetType(framework::proto::VarDesc::READER); + } +}; + +template +class CreateRandomDataGeneratorOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& shape_concat = Attr>("shape_concat"); + const auto& ranks = Attr>("ranks"); + PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty()); + PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), + int(shape_concat.size()), + "The accumulate of all ranks should be equal to the " + "shape concat's length."); + std::vector shapes = RestoreShapes(shape_concat, ranks); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new framework::RandomDataGenerator(shapes, Attr("min"), + Attr("max"))); + } +}; + +class CreateRandomDataGeneratorOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddOutput("Out", "(ReaderHolder) The created random reader."); + AddAttr>("shape_concat", + "The concat of all data's shapes."); + AddAttr>( + "ranks", + "The ranks of each data." + "e.g." + "shape_concat = [2,3,4,5,6]" + "ranks = [3,2]" + "It means the reader will generate two data each time," + "whose shapes are [2,3,4] and [5,6] respectively."); + AddAttr("min", "The lower bound of reader's uniform distribution."); + AddAttr("max", "The upper bound of reader's uniform distribution."); + AddComment(R"DOC( + CreateRandomDataGenerator Operator + + This Op creates a random reader. + The reader generates random data instead of really reading from files. + Generated data follow an uniform distribution between 'min' and 'max'. + )DOC"); + } +}; + +class CreateShuffleReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) + ->Get(); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new framework::ShuffleReader(underlying_reader.Get(), + Attr("buffer_size"))); + } +}; + +class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddInput( + "UnderlyingReader", + "(ReaderHolder) The underlying reader for creating a shuffle reader."); + AddOutput("Out", "(ReaderHolder) The created shuffle reader."); + AddAttr("buffer_size", "The shuffle buffer size.").GreaterThan(0); + AddComment(R"DOC( + CreateShuffleReader Operator + + A shuffle reader takes another reader as its 'underlying reader' + and yields the underlying reader's outputs in a shuffled order. + )DOC"); + } +}; + +class CreateBatchReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) + ->Get(); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new framework::BatchReader(underlying_reader.Get(), + Attr("batch_size"))); + } +}; + +class CreateBatchReaderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddInput( + "UnderlyingReader", + "(ReaderHolder) The underlying reader for creating a batch reader."); + AddOutput("Out", "(ReaderHolder) The created batch reader."); + AddAttr("batch_size", + "How many instances the batch reader yields each time.") + .GreaterThan(0); + AddComment(R"DOC( + CreateBatchReader Operator + + A batch reader takes another reader as its 'underlying reader', + gathers the underlying reader's outputs and then yields them in batches. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(create_random_data_generator, + ops::CreateRandomDataGeneratorOp, + ops::CreateFileReaderInferShape, + ops::CreateRandomDataGeneratorOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::CreateReaderInferVarType); +REGISTER_OPERATOR(create_shuffle_reader, ops::CreateShuffleReaderOp, + ops::CreateDecoratedReaderInferShape, + ops::CreateShuffleReaderOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::CreateReaderInferVarType); +REGISTER_OPERATOR(create_batch_reader, ops::CreateBatchReaderOp, + ops::CreateDecoratedReaderInferShape, + ops::CreateBatchReaderOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::CreateReaderInferVarType); diff --git a/paddle/operators/read_op.cc b/paddle/operators/read_op.cc new file mode 100644 index 0000000000..3ae454101f --- /dev/null +++ b/paddle/operators/read_op.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/framework/op_registry.h" +#include "paddle/framework/reader.h" + +namespace paddle { +namespace operators { + +class ReadInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Reader"), + "The ReadOp must take a reader as input."); + PADDLE_ENFORCE(ctx->HasOutputs("Out"), + "The ReadOp should be assigned with output."); + std::vector reader_dims = ctx->GetReaderDims("Reader"); + std::vector out_names = ctx->Outputs("Out"); + PADDLE_ENFORCE_EQ( + reader_dims.size(), out_names.size(), + "The reader's dim number doesn't match the output number."); + ctx->SetOutputsDim("Out", reader_dims); + } +}; + +class ReadInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + std::string reader_name = op_desc.Input("Reader")[0]; + std::vector out_names = op_desc.Output("Out"); + framework::VarDesc* reader = block->FindVarRecursive(reader_name); + auto dtypes = reader->GetDataTypes(); + PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); + for (size_t i = 0; i < dtypes.size(); ++i) { + framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); + out.SetType(framework::proto::VarDesc::LOD_TENSOR); + out.SetDataType(dtypes[i]); + } + } +}; + +class ReadOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + framework::ReaderHolder* reader = + scope.FindVar(Input("Reader"))->GetMutable(); + if (!reader->HasNext()) { + reader->ReInit(); + PADDLE_ENFORCE( + reader->HasNext(), + "Reader can not read the next data even it has been re-initialized."); + } + std::vector out_arg_names = Outputs("Out"); + std::vector ins; + reader->ReadNext(&ins); + PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); + for (size_t i = 0; i < ins.size(); ++i) { + auto* out = + scope.FindVar(out_arg_names[i])->GetMutable(); + out->ShareDataWith(ins[i]); + out->set_lod(ins[i].lod()); + } + } +}; + +class ReadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReadOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddInput("Reader", "(ReaderHolder) The executed reader."); + AddOutput("Out", "(LoDTensor) The output data.").AsDuplicable(); + AddComment(R"DOC( + Read Operator + + Execute a given reader once and output data. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(read, ops::ReadOp, ops::ReadInferShape, ops::ReadOpMaker, + paddle::framework::EmptyGradOpMaker, ops::ReadInferVarType); diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 0f1953abe0..0a92e10927 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -217,8 +217,6 @@ void BindVarDsec(py::module &m) { .def("set_shapes", &VarDesc::SetShapes) .def("set_dtype", &VarDesc::SetDataType) .def("set_dtypes", &VarDesc::SetDataTypes) - .def("set_tensor_num", &VarDesc::SetTensorDescNum) - .def("tensor_num", &VarDesc::GetTensorDescNum) .def("shape", &VarDesc::GetShape, py::return_value_policy::reference) .def("shapes", &VarDesc::GetShapes, py::return_value_policy::reference) .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference) diff --git a/python/paddle/v2/fluid/tests/test_cpp_reader.py b/python/paddle/v2/fluid/tests/test_cpp_reader.py new file mode 100644 index 0000000000..e71c3a290c --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_cpp_reader.py @@ -0,0 +1,62 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import numpy as np + +prog = fluid.framework.Program() +block = prog.current_block() + +random_reader = block.create_var( + type=fluid.core.VarDesc.VarType.READER, name="RandomDataGenerator") +random_reader.desc.set_lod_levels([0, 0]) + +create_random_data_generator_op = block.append_op( + type="create_random_data_generator", + outputs={"Out": random_reader}, + attrs={ + "shape_concat": [1, 2, 1, 1], + "ranks": [2, 2], + "min": 0.0, + "max": 1.0 + }) + +out1 = block.create_var( + type=fluid.core.VarDesc.VarType.LOD_TENSOR, + name="Out1", + shape=[10, 2], + dtype="float32", + lod_level=1) +out2 = block.create_var( + type=fluid.core.VarDesc.VarType.LOD_TENSOR, + name="Out2", + shape=[10, 1], + dtype="float32", + lod_level=1) + +read_op = block.append_op( + type="read", + inputs={"Reader": random_reader}, + outputs={"Out": [out1, out2]}) + +place = fluid.CPUPlace() +exe = fluid.Executor(place) + +[res1, res2] = exe.run(prog, fetch_list=[out1, out2]) + +if len(res1) == 0 or len(res2) == 0: + exit(1) + +exit(0) diff --git a/python/paddle/v2/fluid/tests/test_protobuf_descs.py b/python/paddle/v2/fluid/tests/test_protobuf_descs.py index 8f335d13db..c590bf1c65 100644 --- a/python/paddle/v2/fluid/tests/test_protobuf_descs.py +++ b/python/paddle/v2/fluid/tests/test_protobuf_descs.py @@ -120,7 +120,6 @@ class TestVarDesc(unittest.TestCase): block = program_desc.block(0) var = block.var('my_reader') var.set_type(core.VarDesc.VarType.READER) - var.set_tensor_num(3) src_shapes = [[2, 3, 3], [4, 5], [6, 7, 8, 9]] var.set_shapes(src_shapes) res_shapes = var.shapes() @@ -141,7 +140,6 @@ class TestVarDesc(unittest.TestCase): block = program_desc.block(0) var = block.var('my_reader') var.set_type(core.VarDesc.VarType.READER) - var.set_tensor_num(3) src_types = [ core.DataType.INT32, core.DataType.FP64, core.DataType.FP32 ] @@ -154,7 +152,6 @@ class TestVarDesc(unittest.TestCase): block = program_desc.block(0) var = block.var('my_reader') var.set_type(core.VarDesc.VarType.READER) - var.set_tensor_num(3) src_types = [3, 1, 2] var.set_lod_levels(src_types) self.assertEqual(src_types, var.lod_levels())