From 7f07dfa1a4cdadbdfd4b24d342dafefb316ca7c1 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Sat, 1 Dec 2018 15:30:26 +0800
Subject: [PATCH 001/123] clean code

---
 .../fluid/operators/reader/ctr_reader_test.cc |  2 +-
 .../reader/lod_tensor_blocking_queue.h        | 12 ++++--------
 paddle/fluid/pybind/pybind.cc                 | 19 ++++++-------------
 python/paddle/fluid/layers/io.py              |  2 +-
 4 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc
index 8dba9baebc..5e672e9aa1 100644
--- a/paddle/fluid/operators/reader/ctr_reader_test.cc
+++ b/paddle/fluid/operators/reader/ctr_reader_test.cc
@@ -126,7 +126,7 @@ TEST(CTR_READER, read_data) {
   LoDTensorBlockingQueueHolder queue_holder;
   int capacity = 64;
-  queue_holder.InitOnce(capacity, {}, false);
+  queue_holder.InitOnce(capacity, false);

   std::shared_ptr<LoDTensorBlockingQueue> queue = queue_holder.GetQueue();

diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
index 3f041ff7e4..5b53edff5d 100644
--- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
+++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
@@ -32,10 +32,8 @@ class LoDTensorBlockingQueue {
   friend class LoDTensorBlockingQueueHolder;

  private:
-  LoDTensorBlockingQueue(size_t capacity,
-                         const std::vector<framework::DDim>& dims,
-                         bool speed_test_mode = false)
-      : queue_(capacity, speed_test_mode), dims_(dims) {}
+  explicit LoDTensorBlockingQueue(size_t capacity, bool speed_test_mode = false)
+      : queue_(capacity, speed_test_mode) {}

  public:
   bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
@@ -65,17 +63,15 @@ class LoDTensorBlockingQueue {

  private:
   BlockingQueue<std::vector<framework::LoDTensor>> queue_;
-  std::vector<framework::DDim> dims_;
 };

 class LoDTensorBlockingQueueHolder {
  public:
-  void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims,
-                bool speed_test_mode = false) {
+  void InitOnce(size_t capacity, bool speed_test_mode = false) {
     PADDLE_ENFORCE(
         queue_ == nullptr,
         "LoDTensorBlockingQueueHolder::InitOnce() can only be called once");
-    queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode));
+    queue_.reset(new LoDTensorBlockingQueue(capacity, speed_test_mode));
   }

   inline const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const {

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index fc7991d297..f0a5d1afc9 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -384,19 +384,12 @@ All parameter, weight, gradient are variables in Paddle.
       .def("is_closed", &LoDTensorBlockingQueue::IsClosed);

   m.def("init_lod_tensor_blocking_queue",
-        [](Variable &var, size_t capacity,
-           const std::vector<std::vector<int64_t>> &shapes)
-            -> std::shared_ptr<LoDTensorBlockingQueue> {
-          std::vector<framework::DDim> dims(shapes.size());
-          std::transform(shapes.begin(), shapes.end(), dims.begin(),
-                         [](const std::vector<int64_t> &shape) {
-                           return make_ddim(shape);
-                         });
-          auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
-          holder->InitOnce(capacity, dims,
-                           FLAGS_reader_queue_speed_test_mode);
-          return holder->GetQueue();
-        },
+        [](Variable &var,
+           size_t capacity) -> std::shared_ptr<LoDTensorBlockingQueue> {
+          auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
+          holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode);
+          return holder->GetQueue();
+        },
         py::return_value_policy::copy);

   py::class_<Scope>(m, "Scope", R"DOC(

diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 3f47053961..3016d8e3a4 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -523,7 +523,7 @@ def _py_reader(capacity,
     double_buffer_name = "_".join([name, "double_buffer"])

     var = global_scope().var(queue_name)
-    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)
+    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity)

     startup_blk = default_startup_program().current_block()
     startup_var = startup_blk.create_var(name=reader_name)
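With the per-slot dims removed, a queue holder is now initialized from a capacity and a speed-test flag alone; shapes travel with the tensors themselves. A minimal C++ sketch of the simplified API, mirroring the updated unit test (sizes and names are illustrative):

    LoDTensorBlockingQueueHolder holder;
    holder.InitOnce(/*capacity=*/64, /*speed_test_mode=*/false);
    std::shared_ptr<LoDTensorBlockingQueue> queue = holder.GetQueue();

    std::vector<framework::LoDTensor> batch;  // tensors produced by a reader
    queue->Push(batch);                       // enqueue one batch of data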
.def("is_closed", &LoDTensorBlockingQueue::IsClosed); m.def("init_lod_tensor_blocking_queue", - [](Variable &var, size_t capacity, - const std::vector> &shapes) - -> std::shared_ptr { - std::vector dims(shapes.size()); - std::transform(shapes.begin(), shapes.end(), dims.begin(), - [](const std::vector &shape) { - return make_ddim(shape); - }); - auto *holder = var.GetMutable(); - holder->InitOnce(capacity, dims, - FLAGS_reader_queue_speed_test_mode); - return holder->GetQueue(); - }, + [](Variable &var, + size_t capacity) -> std::shared_ptr { + auto *holder = var.GetMutable(); + holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); + return holder->GetQueue(); + }, py::return_value_policy::copy); py::class_(m, "Scope", R"DOC( diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 3f47053961..3016d8e3a4 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -523,7 +523,7 @@ def _py_reader(capacity, double_buffer_name = "_".join([name, "double_buffer"]) var = global_scope().var(queue_name) - feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity) startup_blk = default_startup_program().current_block() startup_var = startup_blk.create_var(name=reader_name) From 978fd6800cb05ddbf7d912aa2ba776e29647b8ac Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 1 Dec 2018 16:49:04 +0800 Subject: [PATCH 002/123] update ctr_reader.py --- python/paddle/fluid/contrib/reader/ctr_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/contrib/reader/ctr_reader.py b/python/paddle/fluid/contrib/reader/ctr_reader.py index b8449e8d84..d7133562de 100644 --- a/python/paddle/fluid/contrib/reader/ctr_reader.py +++ b/python/paddle/fluid/contrib/reader/ctr_reader.py @@ -90,7 +90,7 @@ def ctr_reader(feed_data, reader_name = "_".join([name, "reader"]) var = global_scope().var(queue_name) - feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity) startup_blk = default_startup_program().current_block() reader_var = startup_blk.create_var(name=reader_name) From 2cd25794bd186600e79ac94eb9a93593e1d5fbb1 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 2 Dec 2018 11:47:37 +0800 Subject: [PATCH 003/123] add PlainFileReader --- paddle/fluid/operators/reader/ctr_reader.cc | 26 +++++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index d1d3ddc89d..e2f8788a9a 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -95,11 +95,27 @@ class GzipReader : public Reader { igzstream gzstream_; }; -class MultiGzipReader : public Reader { +class PlainFileReader : public Reader { public: - explicit MultiGzipReader(const std::vector& file_list) { + explicit PlainFileReader(const std::string& file_name) + : myfile_(file_name.c_str()) {} + + ~PlainFileReader() {} + + bool HasNext() override { return myfile_.peek() != EOF; } + + void NextLine(std::string* line) override { std::getline(myfile_, *line); } + + private: + std::ifstream myfile_; +}; + +template +class MultiFileReader : public Reader { + public: + explicit MultiFileReader(const std::vector& file_list) { for (auto& file : file_list) { - readers_.emplace_back(std::make_shared(file)); + readers_.emplace_back(std::make_shared(file)); 
From a05a948d89cb3abdb7f60c8ffbf74fdd59b35a7b Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Sun, 2 Dec 2018 13:07:06 +0800
Subject: [PATCH 004/123] update readthread

---
 .../operators/reader/create_ctr_reader_op.cc  | 24 ++++++---
 paddle/fluid/operators/reader/ctr_reader.cc   | 53 +++++++++++--------
 paddle/fluid/operators/reader/ctr_reader.h    | 29 +++++++---
 .../fluid/operators/reader/ctr_reader_test.cc | 15 +++---
 4 files changed, 77 insertions(+), 44 deletions(-)

diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc
index 58a465d87a..e66263fee1 100644
--- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc
@@ -41,13 +41,16 @@ class CreateCTRReaderOp : public framework::OperatorBase {
     auto* queue_holder =
         queue_holder_var->template GetMutable<LoDTensorBlockingQueueHolder>();

-    int thread_num = Attr<int>("thread_num");
-    std::vector<std::string> slots = Attr<std::vector<std::string>>("slots");
-    int batch_size = Attr<int>("batch_size");
-    std::vector<std::string> file_list =
-        Attr<std::vector<std::string>>("file_list");
-    out->Reset(std::make_shared<CTRReader>(queue_holder->GetQueue(), batch_size,
-                                           thread_num, slots, file_list));
+    auto thread_num = Attr<int>("thread_num");
+    auto sparse_slots = Attr<std::vector<std::string>>("sparse_slots");
+    auto dense_slots = Attr<std::vector<std::string>>("dense_slots");
+    auto batch_size = Attr<int>("batch_size");
+    auto file_type = Attr<std::string>("file_type");
+    auto file_format = Attr<std::string>("file_format");
+    auto file_list = Attr<std::vector<std::string>>("file_list");
+    out->Reset(std::make_shared<CTRReader>(
+        queue_holder->GetQueue(), batch_size, thread_num, file_type,
+        file_format, dense_slots, sparse_slots, file_list));
   }
 };

@@ -58,10 +61,15 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase {
              "Name of the `LoDTensorBlockingQueueHolder` variable");
     AddAttr<int>("thread_num", "the thread num to read data");
     AddAttr<int>("batch_size", "the batch size of read data");
+    AddAttr<std::string>("file_type", "plain or gzip").SetDefault("plain");
+    AddAttr<std::string>("file_format", "svm or csv").SetDefault("csv");
     AddAttr<std::vector<std::string>>("file_list",
                                       "The list of files that need to read");
     AddAttr<std::vector<std::string>>(
-        "slots", "the slots that should be extract from file");
+        "dense_slots", "the sparse slots id that should be extract from file")
+        .SetDefault({});
+    AddAttr<std::vector<std::string>>(
+        "sparse_slots", "the sparse slots id that should be extract from file");

     AddComment(R"DOC(
       Create CTRReader to support read ctr data with cpp.
diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc
index e2f8788a9a..0993957656 100644
--- a/paddle/fluid/operators/reader/ctr_reader.cc
+++ b/paddle/fluid/operators/reader/ctr_reader.cc
@@ -141,40 +141,42 @@ class MultiFileReader : public Reader {

 void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
                    std::shared_ptr<LoDTensorBlockingQueue> queue) {
-  VLOG(30) << "monitor thread in";
+  VLOG(3) << "monitor thread in";
   bool reader_thread_is_running = true;
   while (reader_thread_is_running) {
-    VLOG(30) << "reader_thread_is_running";
+    VLOG(3) << "reader_thread_is_running";
     reader_thread_is_running = false;
     for (size_t i = 0; i < (*thread_status).size(); ++i) {
       if ((*thread_status)[i] == Running) {
-        VLOG(30) << "reader is running!";
+        VLOG(3) << "reader is running!";
         reader_thread_is_running = true;
       }
     }
     std::this_thread::sleep_for(std::chrono::milliseconds(1000));
   }
-  VLOG(30) << "all reader thread is stopped, push empty data into queue";
+  VLOG(3) << "all reader thread is stopped, push empty data into queue";
   queue->Push({});
-  VLOG(30) << "monitor thread exited";
+  VLOG(3) << "monitor thread exited";
 }

 void ReadThread(const std::vector<std::string>& file_list,
-                const std::vector<std::string>& slots, int batch_size,
+                const std::string& file_type, const std::string& file_format,
+                const std::vector<std::string>& dense_slots,
+                const std::vector<std::string>& sparse_slots, int batch_size,
                 int thread_id, std::vector<ReaderThreadStatus>* thread_status,
                 std::shared_ptr<LoDTensorBlockingQueue> queue) {
-  VLOG(30) << "[" << thread_id << "]"
-           << " reader thread start! thread_id = " << thread_id;
+  VLOG(3) << "[" << thread_id << "]"
+          << " reader thread start! thread_id = " << thread_id;
   for (auto& file : file_list) {
-    VLOG(30) << "[" << thread_id << "]"
-             << " file " << file;
+    VLOG(3) << "[" << thread_id << "]"
+            << " file " << file;
   }
   (*thread_status)[thread_id] = Running;
-  VLOG(30) << "set status to running";
+  VLOG(3) << "set status to running";

   std::unordered_map<std::string, size_t> slot_to_index;
-  for (size_t i = 0; i < slots.size(); ++i) {
-    slot_to_index[slots[i]] = i;
+  for (size_t i = 0; i < sparse_slots.size(); ++i) {
+    slot_to_index[sparse_slots[i]] = i;
   }

   std::string line;
@@ -182,11 +184,18 @@ void ReadThread(const std::vector<std::string>& file_list,
   std::vector<std::unordered_map<std::string, std::vector<int64_t>>> batch_data;
   std::vector<int64_t> batch_label;

-  MultiFileReader<GzipReader> reader(file_list);
+  std::unique_ptr<Reader> reader;
+  if (file_type == "gzip") {
+    reader.reset(new MultiFileReader<GzipReader>(file_list));
+  } else if (file_type == "plain") {
+    reader.reset(new MultiFileReader<PlainFileReader>(file_list));
+  } else {
+    PADDLE_THROW("do not support file format %s", file_type);
+  }

-  VLOG(30) << "reader inited";
+  VLOG(3) << "reader inited";

-  while (reader.HasNext()) {
+  while (reader->HasNext()) {
     batch_data.clear();
     batch_data.reserve(batch_size);

@@ -195,8 +204,8 @@ void ReadThread(const std::vector<std::string>& file_list,

     // read batch_size data
     for (int i = 0; i < batch_size; ++i) {
-      if (reader.HasNext()) {
-        reader.NextLine(&line);
+      if (reader->HasNext()) {
+        reader->NextLine(&line);
         std::unordered_map<std::string, std::vector<int64_t>> slot_to_data;
         int64_t label;
         parse_line(line, slot_to_index, &label, &slot_to_data);
@@ -209,8 +218,8 @@ void ReadThread(const std::vector<std::string>& file_list,

     std::vector<framework::LoDTensor> lod_datas;

-    // first insert tensor for each slots
-    for (auto& slot : slots) {
+    // first insert tensor for each sparse_slots
+    for (auto& slot : sparse_slots) {
       std::vector<size_t> lod_data{0};
       std::vector<int64_t> batch_feasign;

@@ -242,11 +251,11 @@ void ReadThread(const std::vector<std::string>& file_list,
     lod_datas.push_back(label_tensor);

     queue->Push(lod_datas);
-    VLOG(40) << "push one data, queue_size=" << queue->Size();
+    VLOG(4) << "push one data, queue_size=" << queue->Size();
   }

   (*thread_status)[thread_id] = Stopped;
-  VLOG(30) << "set status to stopped, thread " << thread_id << " exited";
+  VLOG(3) << "set status to stopped, thread " << thread_id << " exited";
 }
"push one data, queue_size=" << queue->Size(); } (*thread_status)[thread_id] = Stopped; - VLOG(30) << "set status to stopped, thread " << thread_id << " exited"; + VLOG(3) << "set status to stopped, thread " << thread_id << " exited"; } } // namespace reader diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 9b2a11bae1..68d587bbfc 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -36,7 +36,9 @@ namespace reader { enum ReaderThreadStatus { Running, Stopped }; void ReadThread(const std::vector& file_list, - const std::vector& slots, int batch_size, + const std::string& file_type, const std::string& file_format, + const std::vector& dense_slots, + const std::vector& sparse_slots, int batch_size, int thread_id, std::vector* thread_status, std::shared_ptr queue); @@ -47,11 +49,18 @@ void MonitorThread(std::vector* thread_status, class CTRReader : public framework::FileReader { public: - explicit CTRReader(const std::shared_ptr& queue, - int batch_size, int thread_num, - const std::vector& slots, - const std::vector& file_list) - : batch_size_(batch_size), slots_(slots), file_list_(file_list) { + CTRReader(const std::shared_ptr& queue, + int batch_size, int thread_num, const std::string& file_type, + const std::string& file_format, + const std::vector& dense_slots, + const std::vector& sparse_slots, + const std::vector& file_list) + : batch_size_(batch_size), + file_type_(file_type), + file_format_(file_format), + dense_slots_(dense_slots), + sparse_slots_(sparse_slots), + file_list_(file_list) { PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!"); PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); @@ -97,7 +106,8 @@ class CTRReader : public framework::FileReader { VLOG(3) << "thread_num " << thread_num_; for (int thread_id = 0; thread_id < thread_num_; thread_id++) { read_threads_.emplace_back(new std::thread( - std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_, + std::bind(&ReadThread, file_groups_[thread_id], file_type_, + file_format_, dense_slots_, sparse_slots_, batch_size_, thread_id, &read_thread_status_, queue_))); } monitor_thread_.reset(new std::thread( @@ -119,7 +129,10 @@ class CTRReader : public framework::FileReader { private: size_t thread_num_; const int batch_size_; - const std::vector slots_; + const std::string file_type_; + const std::string file_format_; + const std::vector dense_slots_; + const std::vector sparse_slots_; const std::vector file_list_; std::shared_ptr queue_; std::vector> read_threads_; diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 5e672e9aa1..734bf45383 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -132,24 +132,27 @@ TEST(CTR_READER, read_data) { int batch_size = 3; int thread_num = 1; - std::vector slots = {"6002", "6003"}; + std::vector sparse_slots = {"6002", "6003"}; std::vector file_list; for (int i = 0; i < thread_num; ++i) { file_list.push_back(gz_file_name); } - CTRReader reader(queue, batch_size, thread_num, slots, file_list); + CTRReader reader(queue, batch_size, thread_num, "gzip", "plain", {}, + sparse_slots, file_list); reader.Start(); size_t batch_num = std::ceil(static_cast(ctr_data.size()) / batch_size) * thread_num; - check_all_data(ctr_data, slots, 
-  check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002,
-                 data_slot_6003, batch_num, batch_size, queue, &reader);
+  check_all_data(ctr_data, sparse_slots, label_dims, label_value,
+                 data_slot_6002, data_slot_6003, batch_num, batch_size, queue,
+                 &reader);

   reader.Shutdown();

   reader.Start();
-  check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002,
-                 data_slot_6003, batch_num, batch_size, queue, &reader);
+  check_all_data(ctr_data, sparse_slots, label_dims, label_value,
+                 data_slot_6002, data_slot_6003, batch_num, batch_size, queue,
+                 &reader);
   reader.Shutdown();
 }

From d7c8ebac2eafc87e887dcf9f4e38b9d3f7661d1d Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Sun, 2 Dec 2018 14:40:43 +0800
Subject: [PATCH 005/123] add datadesc

---
 .../operators/reader/create_ctr_reader_op.cc  | 25 ++++---
 paddle/fluid/operators/reader/ctr_reader.cc   | 39 +++++++----
 paddle/fluid/operators/reader/ctr_reader.h    | 68 +++++++++++--------
 .../fluid/operators/reader/ctr_reader_test.cc |  7 +-
 4 files changed, 87 insertions(+), 52 deletions(-)

diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc
index e66263fee1..5b9e2ba693 100644
--- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc
@@ -43,14 +43,16 @@ class CreateCTRReaderOp : public framework::OperatorBase {

     auto thread_num = Attr<int>("thread_num");
     auto sparse_slots = Attr<std::vector<std::string>>("sparse_slots");
-    auto dense_slots = Attr<std::vector<std::string>>("dense_slots");
+    auto dense_slot_index = Attr<std::vector<int>>("dense_slot_index");
+    auto sparse_slot_index = Attr<std::vector<int>>("sparse_slot_index");
     auto batch_size = Attr<int>("batch_size");
     auto file_type = Attr<std::string>("file_type");
     auto file_format = Attr<std::string>("file_format");
     auto file_list = Attr<std::vector<std::string>>("file_list");
-    out->Reset(std::make_shared<CTRReader>(
-        queue_holder->GetQueue(), batch_size, thread_num, file_type,
-        file_format, dense_slots, sparse_slots, file_list));
+    DataDesc data_desc(batch_size, file_list, file_type, file_format,
+                       dense_slot_index, sparse_slot_index, sparse_slots);
+    out->Reset(std::make_shared<CTRReader>(queue_holder->GetQueue(), thread_num,
+                                           data_desc));
   }
 };

@@ -58,10 +61,15 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase {
     AddAttr<std::string>("file_type", "plain or gzip").SetDefault("plain");
     AddAttr<std::string>("file_format", "svm or csv").SetDefault("csv");
     AddAttr<std::vector<std::string>>("file_list",
                                       "The list of files that need to read");
-    AddAttr<std::vector<std::string>>(
-        "dense_slots", "the sparse slots id that should be extract from file")
-        .SetDefault({});
+    AddAttr<std::vector<int>>(
+        "dense_slot_index",
+        "the sparse slots id that should be extract from file")
+        .SetDefault({});
+    AddAttr<std::vector<int>>(
+        "dense_slot_index",
+        "the sparse slots id that should be extract from file")
+        .SetDefault({});
-    AddAttr<std::vector<std::string>>(
-        "sparse_slots", "the sparse slots id that should be extract from file");
+    AddAttr<std::vector<std::string>>("sparse_slots",
+                                      "the sparse slots id that should be "
+                                      "extract from file, used when file "
+                                      "format is svm");

     AddComment(R"DOC(
       Create CTRReader to support read ctr data with cpp.
diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc
index 0993957656..0af55b503e 100644
--- a/paddle/fluid/operators/reader/ctr_reader.cc
+++ b/paddle/fluid/operators/reader/ctr_reader.cc
@@ -73,6 +73,21 @@ static inline void parse_line(
   }
 }

+// label slot1:fea_sign slot2:fea_sign slot1:fea_sign
+static inline void parse_svm_line(const std::string& line) {}
+
+// label,dense_fea,dense_fea,sparse_fea,sparse_fea
+static inline void parse_csv_line(const std::string& line,
+                                  const std::vector<int>& dense_slots,
+                                  const std::vector<int>& sparse_slots,
+                                  int64_t* label,
+                                  std::vector<float>* dense_datas,
+                                  std::vector<int64_t>* sparse_datas) {
+  std::vector<std::string> ret;
+  string_split(line, ',', &ret);
+  *label = std::stoi(ret[2]) > 0;
+}
+
 class Reader {
  public:
  virtual ~Reader() {}
@@ -160,10 +175,8 @@ void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
   VLOG(3) << "monitor thread exited";
 }

 void ReadThread(const std::vector<std::string>& file_list,
-                const std::string& file_type, const std::string& file_format,
-                const std::vector<std::string>& dense_slots,
-                const std::vector<std::string>& sparse_slots, int batch_size,
-                int thread_id, std::vector<ReaderThreadStatus>* thread_status,
+                const DataDesc& data_desc, int thread_id,
+                std::vector<ReaderThreadStatus>* thread_status,
                 std::shared_ptr<LoDTensorBlockingQueue> queue) {
   VLOG(3) << "[" << thread_id << "]"
           << " reader thread start! thread_id = " << thread_id;
@@ -175,8 +188,8 @@ void ReadThread(const std::vector<std::string>& file_list,
   (*thread_status)[thread_id] = Running;
   VLOG(3) << "set status to running";

   std::unordered_map<std::string, size_t> slot_to_index;
-  for (size_t i = 0; i < sparse_slots.size(); ++i) {
-    slot_to_index[sparse_slots[i]] = i;
+  for (size_t i = 0; i < data_desc.sparse_slot_ids_.size(); ++i) {
+    slot_to_index[data_desc.sparse_slot_ids_[i]] = i;
   }

   std::string line;
@@ -184,25 +197,25 @@ void ReadThread(const std::vector<std::string>& file_list,
   std::vector<std::unordered_map<std::string, std::vector<int64_t>>> batch_data;
   std::vector<int64_t> batch_label;

   std::unique_ptr<Reader> reader;
-  if (file_type == "gzip") {
-    reader.reset(new MultiFileReader<GzipReader>(file_list));
-  } else if (file_type == "plain") {
-    reader.reset(new MultiFileReader<PlainFileReader>(file_list));
-  } else {
-    PADDLE_THROW("do not support file format %s", file_type);
+  if (data_desc.file_type_ == "gzip") {
+    reader.reset(new MultiFileReader<GzipReader>(file_list));
+  } else if (data_desc.file_type_ == "plain") {
+    reader.reset(new MultiFileReader<PlainFileReader>(file_list));
+  } else {
+    PADDLE_THROW("do not support file format %s", data_desc.file_type_);
   }

   VLOG(3) << "reader inited";

   while (reader->HasNext()) {
     batch_data.clear();
-    batch_data.reserve(batch_size);
+    batch_data.reserve(data_desc.batch_size_);

     batch_label.clear();
-    batch_label.reserve(batch_size);
+    batch_label.reserve(data_desc.batch_size_);

     // read batch_size data
-    for (int i = 0; i < batch_size; ++i) {
+    for (int i = 0; i < data_desc.batch_size_; ++i) {
       if (reader->HasNext()) {
         reader->NextLine(&line);
         std::unordered_map<std::string, std::vector<int64_t>> slot_to_data;
         int64_t label;
         parse_line(line, slot_to_index, &label, &slot_to_data);
@@ -219,7 +232,7 @@ void ReadThread(const std::vector<std::string>& file_list,
     std::vector<framework::LoDTensor> lod_datas;

     // first insert tensor for each sparse_slots
-    for (auto& slot : sparse_slots) {
+    for (auto& slot : data_desc.sparse_slot_ids_) {
       std::vector<size_t> lod_data{0};
       std::vector<int64_t> batch_feasign;

diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h
index 68d587bbfc..1f4663e3b8 100644
--- a/paddle/fluid/operators/reader/ctr_reader.h
+++ b/paddle/fluid/operators/reader/ctr_reader.h
@@ -35,11 +35,34 @@ namespace reader {

 enum ReaderThreadStatus { Running, Stopped };

+struct DataDesc {
+  DataDesc(int batch_size, const std::vector<std::string>& file_names,
+           const std::string& file_type, const std::string& file_format,
+           const std::vector<int>& dense_slot_index,
+           const std::vector<int>& sparse_slot_index,
+           const std::vector<std::string>& sparse_slot_ids)
+      : batch_size_(batch_size),
+        file_names_(file_names),
+        file_type_(file_type),
+        file_format_(file_format),
+        dense_slot_index_(dense_slot_index),
+        sparse_slot_index_(sparse_slot_index),
+        sparse_slot_ids_(sparse_slot_ids) {}
+
+  const int batch_size_;
+  const std::vector<std::string> file_names_;
+  const std::string file_type_;    // gzip or plain
+  const std::string file_format_;  // csv or svm
+  // used for csv data format
+  const std::vector<int> dense_slot_index_;
+  const std::vector<int> sparse_slot_index_;
+  // used for svm data format
+  const std::vector<std::string> sparse_slot_ids_;
+};
+
 void ReadThread(const std::vector<std::string>& file_list,
-                const std::string& file_type, const std::string& file_format,
-                const std::vector<std::string>& dense_slots,
-                const std::vector<std::string>& sparse_slots, int batch_size,
-                int thread_id, std::vector<ReaderThreadStatus>* thread_status,
+                const DataDesc& data_desc, int thread_id,
+                std::vector<ReaderThreadStatus>* thread_status,
                 std::shared_ptr<LoDTensorBlockingQueue> queue);

 // monitor all running thread, if they are all stopped,
@@ -50,22 +73,15 @@ void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,

 class CTRReader : public framework::FileReader {
  public:
   CTRReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue,
-            int batch_size, int thread_num, const std::string& file_type,
-            const std::string& file_format,
-            const std::vector<std::string>& dense_slots,
-            const std::vector<std::string>& sparse_slots,
-            const std::vector<std::string>& file_list)
-      : batch_size_(batch_size),
-        file_type_(file_type),
-        file_format_(file_format),
-        dense_slots_(dense_slots),
-        sparse_slots_(sparse_slots),
-        file_list_(file_list) {
+            int thread_num, const DataDesc& data_desc)
+      : data_desc_(data_desc) {
     PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!");
     PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
-    PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty");
-    thread_num_ =
-        file_list_.size() > thread_num ? thread_num : file_list_.size();
+    PADDLE_ENFORCE_GT(data_desc_.file_names_.size(), 0,
+                      "file list should not be empty");
+    thread_num_ = data_desc_.file_names_.size() > thread_num
+                      ? thread_num
+                      : data_desc_.file_names_.size();
     queue_ = queue;
     SplitFiles();
     for (size_t i = 0; i < thread_num_; ++i) {
@@ -106,9 +122,8 @@ class CTRReader : public framework::FileReader {
     VLOG(3) << "thread_num " << thread_num_;
     for (int thread_id = 0; thread_id < thread_num_; thread_id++) {
       read_threads_.emplace_back(new std::thread(
-          std::bind(&ReadThread, file_groups_[thread_id], file_type_,
-                    file_format_, dense_slots_, sparse_slots_, batch_size_,
-                    thread_id, &read_thread_status_, queue_)));
+          std::bind(&ReadThread, file_groups_[thread_id], data_desc_, thread_id,
+                    &read_thread_status_, queue_)));
     }
     monitor_thread_.reset(new std::thread(
         std::bind(&MonitorThread, &read_thread_status_, queue_)));
@@ -118,8 +133,8 @@ class CTRReader : public framework::FileReader {
  private:
   void SplitFiles() {
     file_groups_.resize(thread_num_);
-    for (size_t i = 0; i < file_list_.size(); ++i) {
-      auto& file_name = file_list_[i];
+    for (size_t i = 0; i < data_desc_.file_names_.size(); ++i) {
+      auto& file_name = data_desc_.file_names_[i];
       std::ifstream f(file_name.c_str());
       PADDLE_ENFORCE(f.good(), "file %s not exist!", file_name);
       file_groups_[i % thread_num_].push_back(file_name);
@@ -128,12 +143,7 @@ class CTRReader : public framework::FileReader {

  private:
   size_t thread_num_;
-  const int batch_size_;
-  const std::string file_type_;
-  const std::string file_format_;
-  const std::vector<std::string> dense_slots_;
-  const std::vector<std::string> sparse_slots_;
-  const std::vector<std::string> file_list_;
+  const DataDesc data_desc_;
   std::shared_ptr<LoDTensorBlockingQueue> queue_;
   std::vector<std::unique_ptr<std::thread>> read_threads_;
   std::unique_ptr<std::thread> monitor_thread_;

diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc
index 734bf45383..a14e21bc8d 100644
--- a/paddle/fluid/operators/reader/ctr_reader_test.cc
+++ b/paddle/fluid/operators/reader/ctr_reader_test.cc
@@ -36,6 +36,7 @@ using paddle::framework::LoD;
 using paddle::framework::DDim;
 using paddle::platform::CPUPlace;
 using paddle::framework::make_ddim;
+using paddle::operators::reader::DataDesc;

 static void generatedata(const std::vector<std::string>& data,
                          const std::string& file_name) {
@@ -138,8 +139,10 @@ TEST(CTR_READER, read_data) {
     file_list.push_back(gz_file_name);
   }

-  CTRReader reader(queue, batch_size, thread_num, "gzip", "plain", {},
-                   sparse_slots, file_list);
+  DataDesc data_desc(batch_size, file_list, "gzip", "plain", {}, {},
+                     sparse_slots);
+
+  CTRReader reader(queue, thread_num, data_desc);

   reader.Start();
   size_t batch_num =
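DataDesc now bundles everything a read thread needs to interpret its files. A hedged sketch of the two typical configurations, matching how the tests in this series construct it (file names are placeholders):

    // svm-format input: only sparse slot ids are needed.
    DataDesc svm_desc(/*batch_size=*/3, {"part-0.gz"}, "gzip", "svm",
                      /*dense_slot_index=*/{}, /*sparse_slot_index=*/{},
                      /*sparse_slot_ids=*/{"6002", "6003"});

    // csv-format input: column indexes select dense and sparse fields.
    DataDesc csv_desc(/*batch_size=*/3, {"part-0.txt"}, "plain", "csv",
                      /*dense_slot_index=*/{1}, /*sparse_slot_index=*/{2},
                      /*sparse_slot_ids=*/{});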
From fbd6f50148bb7eaf40ced1964737b2550ab746a1 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Sun, 2 Dec 2018 14:55:35 +0800
Subject: [PATCH 006/123] add ReadSvmData

---
 paddle/fluid/operators/reader/ctr_reader.cc   | 67 +++++++++++--------
 .../fluid/operators/reader/ctr_reader_test.cc |  2 +-
 2 files changed, 40 insertions(+), 29 deletions(-)

diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc
index 0af55b503e..9834d7183a 100644
--- a/paddle/fluid/operators/reader/ctr_reader.cc
+++ b/paddle/fluid/operators/reader/ctr_reader.cc
@@ -78,14 +78,18 @@ static inline void parse_svm_line(const std::string& line) {}

 // label,dense_fea,dense_fea,sparse_fea,sparse_fea
 static inline void parse_csv_line(const std::string& line,
-                                  const std::vector<int>& dense_slots,
-                                  const std::vector<int>& sparse_slots,
-                                  int64_t* label,
+                                  const DataDesc& data_desc, int64_t* label,
                                   std::vector<float>* dense_datas,
                                   std::vector<int64_t>* sparse_datas) {
   std::vector<std::string> ret;
   string_split(line, ',', &ret);
-  *label = std::stoi(ret[2]) > 0;
+  *label = std::stol(ret[2]) > 0;
+  for (auto& idx : data_desc.dense_slot_index_) {
+    dense_datas->push_back(std::stof(ret[idx]));
+  }
+  for (auto& idx : data_desc.sparse_slot_index_) {
+    sparse_datas->push_back(std::stol(ret[idx]));
+  }
 }

@@ -174,19 +178,8 @@ void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
   VLOG(3) << "monitor thread exited";
 }

-void ReadThread(const std::vector<std::string>& file_list,
-                const DataDesc& data_desc, int thread_id,
-                std::vector<ReaderThreadStatus>* thread_status,
-                std::shared_ptr<LoDTensorBlockingQueue> queue) {
-  VLOG(3) << "[" << thread_id << "]"
-          << " reader thread start! thread_id = " << thread_id;
-  for (auto& file : file_list) {
-    VLOG(3) << "[" << thread_id << "]"
-            << " file " << file;
-  }
-  (*thread_status)[thread_id] = Running;
-  VLOG(3) << "set status to running";
-
+void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
+                 std::shared_ptr<LoDTensorBlockingQueue> queue) {
   std::unordered_map<std::string, size_t> slot_to_index;
   for (size_t i = 0; i < data_desc.sparse_slot_ids_.size(); ++i) {
     slot_to_index[data_desc.sparse_slot_ids_[i]] = i;
@@ -197,17 +190,6 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
   std::vector<std::unordered_map<std::string, std::vector<int64_t>>> batch_data;
   std::vector<int64_t> batch_label;

-  std::unique_ptr<Reader> reader;
-  if (data_desc.file_type_ == "gzip") {
-    reader.reset(new MultiFileReader<GzipReader>(file_list));
-  } else if (data_desc.file_type_ == "plain") {
-    reader.reset(new MultiFileReader<PlainFileReader>(file_list));
-  } else {
-    PADDLE_THROW("do not support file format %s", data_desc.file_type_);
-  }
-
-  VLOG(3) << "reader inited";
-
   while (reader->HasNext()) {
     batch_data.clear();
     batch_data.reserve(data_desc.batch_size_);
@@ -266,6 +248,35 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
     queue->Push(lod_datas);
     VLOG(4) << "push one data, queue_size=" << queue->Size();
   }
+}
+
+void ReadThread(const std::vector<std::string>& file_list,
+                const DataDesc& data_desc, int thread_id,
+                std::vector<ReaderThreadStatus>* thread_status,
+                std::shared_ptr<LoDTensorBlockingQueue> queue) {
+  VLOG(3) << "[" << thread_id << "]"
+          << " reader thread start! thread_id = " << thread_id;
thread_id = " << thread_id; + for (auto& file : file_list) { + VLOG(3) << "[" << thread_id << "]" + << " file " << file; + } + (*thread_status)[thread_id] = Running; + VLOG(3) << "set status to running"; + + std::shared_ptr reader; + if (data_desc.file_type_ == "gzip") { + reader.reset(new MultiFileReader(file_list)); + } else if (data_desc.file_type_ == "plain") { + reader.reset(new MultiFileReader(file_list)); + } else { + PADDLE_THROW("do not support file format %s", data_desc.file_type_); + } + + VLOG(3) << "reader inited"; + + if (data_desc.file_format_ == "svm") { + ReadSvmData(data_desc, reader, queue); + } (*thread_status)[thread_id] = Stopped; VLOG(3) << "set status to stopped, thread " << thread_id << " exited"; diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index a14e21bc8d..dfdaae3a04 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -139,7 +139,7 @@ TEST(CTR_READER, read_data) { file_list.push_back(gz_file_name); } - DataDesc data_desc(batch_size, file_list, "gzip", "plain", {}, {}, + DataDesc data_desc(batch_size, file_list, "gzip", "svm", {}, {}, sparse_slots); CTRReader reader(queue, thread_num, data_desc); From 9f53aad13ad840a2b49546bb5832eb74ee268687 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 3 Dec 2018 10:16:40 +0800 Subject: [PATCH 007/123] add test for read csv data --- paddle/fluid/operators/reader/ctr_reader.cc | 144 ++++++++++++++++-- .../fluid/operators/reader/ctr_reader_test.cc | 68 +++++++++ 2 files changed, 196 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 9834d7183a..3595d771b4 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -76,22 +76,6 @@ static inline void parse_line( // label slot1:fea_sign slot2:fea_sign slot1:fea_sign static inline void parse_svm_line(const std::string& line) {} -// label,dense_fea,dense_fea,sparse_fea,sparse_fea -static inline void parse_csv_line(const std::string& line, - const DataDesc& data_desc, int64_t* label, - std::vector* dense_datas, - std::vector* sparse_datas) { - std::vector ret; - string_split(line, ',', &ret); - *label = std::stol(ret[2]) > 0; - for (auto& idx : data_desc.dense_slot_index_) { - dense_datas->push_back(std::stof(ret[idx])); - } - for (auto& idx : data_desc.sparse_slot_index_) { - sparse_datas->push_back(std::stol(ret[idx])); - } -} - class Reader { public: virtual ~Reader() {} @@ -250,6 +234,132 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr reader, } } +// label dense_fea,dense_fea sparse_fea,sparse_fea +static inline void parse_csv_line( + const std::string& line, const DataDesc& data_desc, int64_t* label, + std::vector>* dense_datas, + std::vector>* sparse_datas) { + std::vector ret; + string_split(line, ' ', &ret); + *label = std::stol(ret[0]); + dense_datas->resize(data_desc.dense_slot_index_.size()); + for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) { + int slot_idx = data_desc.dense_slot_index_[i]; + auto& slot_data = ret[slot_idx]; + std::vector data_in_slot_str; + string_split(ret[slot_idx], ',', &data_in_slot_str); + std::vector data_in_slot; + for (auto& data_str : data_in_slot_str) { + (*dense_datas)[i].push_back(std::stof(data_str)); + } + } + sparse_datas->resize(data_desc.sparse_slot_index_.size()); + for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) 
From 9f53aad13ad840a2b49546bb5832eb74ee268687 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Mon, 3 Dec 2018 10:16:40 +0800
Subject: [PATCH 007/123] add test for read csv data

---
 paddle/fluid/operators/reader/ctr_reader.cc   | 144 ++++++++++++++++--
 .../fluid/operators/reader/ctr_reader_test.cc |  68 +++++++++
 2 files changed, 196 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc
index 9834d7183a..3595d771b4 100644
--- a/paddle/fluid/operators/reader/ctr_reader.cc
+++ b/paddle/fluid/operators/reader/ctr_reader.cc
@@ -76,22 +76,6 @@ static inline void parse_line(
 // label slot1:fea_sign slot2:fea_sign slot1:fea_sign
 static inline void parse_svm_line(const std::string& line) {}

-// label,dense_fea,dense_fea,sparse_fea,sparse_fea
-static inline void parse_csv_line(const std::string& line,
-                                  const DataDesc& data_desc, int64_t* label,
-                                  std::vector<float>* dense_datas,
-                                  std::vector<int64_t>* sparse_datas) {
-  std::vector<std::string> ret;
-  string_split(line, ',', &ret);
-  *label = std::stol(ret[2]) > 0;
-  for (auto& idx : data_desc.dense_slot_index_) {
-    dense_datas->push_back(std::stof(ret[idx]));
-  }
-  for (auto& idx : data_desc.sparse_slot_index_) {
-    sparse_datas->push_back(std::stol(ret[idx]));
-  }
-}
-
 class Reader {
  public:
  virtual ~Reader() {}
@@ -234,6 +218,132 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
   }
 }

+// label dense_fea,dense_fea sparse_fea,sparse_fea
+static inline void parse_csv_line(
+    const std::string& line, const DataDesc& data_desc, int64_t* label,
+    std::vector<std::vector<float>>* dense_datas,
+    std::vector<std::vector<int64_t>>* sparse_datas) {
+  std::vector<std::string> ret;
+  string_split(line, ' ', &ret);
+  *label = std::stol(ret[0]);
+  dense_datas->resize(data_desc.dense_slot_index_.size());
+  for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) {
+    int slot_idx = data_desc.dense_slot_index_[i];
+    auto& slot_data = ret[slot_idx];
+    std::vector<std::string> data_in_slot_str;
+    string_split(ret[slot_idx], ',', &data_in_slot_str);
+    std::vector<float> data_in_slot;
+    for (auto& data_str : data_in_slot_str) {
+      (*dense_datas)[i].push_back(std::stof(data_str));
+    }
+  }
+  sparse_datas->resize(data_desc.sparse_slot_index_.size());
+  for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) {
+    int slot_idx = data_desc.sparse_slot_index_[i];
+    auto& slot_data = ret[slot_idx];
+    std::vector<std::string> data_in_slot_str;
+    string_split(ret[slot_idx], ',', &data_in_slot_str);
+    std::vector<int64_t> data_in_slot;
+    for (auto& data_str : data_in_slot_str) {
+      (*sparse_datas)[i].push_back(std::stol(data_str));
+    }
+  }
+}
+
+void ReadCsvData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
+                 std::shared_ptr<LoDTensorBlockingQueue> queue) {
+  std::string line;
+  while (reader->HasNext()) {
+    std::vector<int64_t> batch_label;
+    batch_label.reserve(data_desc.batch_size_);
+
+    std::vector<std::vector<std::vector<float>>> batch_dense_data;
+    batch_dense_data.reserve(data_desc.batch_size_);
+
+    std::vector<std::vector<std::vector<int64_t>>> batch_sparse_data;
+    batch_sparse_data.reserve(data_desc.batch_size_);
+
+    // read batch_size data
+    for (int i = 0; i < data_desc.batch_size_; ++i) {
+      if (reader->HasNext()) {
+        reader->NextLine(&line);
+        int64_t label;
+        std::vector<std::vector<float>> dense_datas;
+        std::vector<std::vector<int64_t>> sparse_datas;
+        parse_csv_line(line, data_desc, &label, &dense_datas, &sparse_datas);
+        batch_label.push_back(label);
+        if (!batch_dense_data.empty()) {
+          PADDLE_ENFORCE_EQ(batch_dense_data[0].size(), dense_datas.size(),
+                            "dense data should have the same shape");
+        }
+        batch_dense_data.push_back(dense_datas);
+        batch_sparse_data.push_back(sparse_datas);
+      } else {
+        break;
+      }
+    }
+
+    // the order of output data is label, dense_datas, sparse_datas
+    std::vector<framework::LoDTensor> lod_datas;
+
+    // insert label tensor
+    framework::LoDTensor label_tensor;
+    auto* label_tensor_data = label_tensor.mutable_data<int64_t>(
+        framework::make_ddim({static_cast<int64_t>(batch_label.size()), 1}),
+        platform::CPUPlace());
+    memcpy(label_tensor_data, batch_label.data(),
+           batch_label.size() * sizeof(int64_t));
+    auto dim =
+        framework::make_ddim({static_cast<int64_t>(batch_label.size()), 1});
+    lod_datas.push_back(label_tensor);
+
+    // insert tensor for each dense_slots
+    for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) {
+      framework::LoDTensor lod_tensor;
+      size_t width = batch_dense_data[0][i].size();
+      auto* tensor_data = lod_tensor.mutable_data<float>(
+          framework::make_ddim(
+              {static_cast<int64_t>(batch_dense_data.size()),  // batch_size
+               static_cast<int64_t>(width)}),
+          platform::CPUPlace());
+
+      for (size_t j = 0; j < batch_dense_data.size(); ++j) {
+        auto& dense_data_row = batch_dense_data[j][i];
+        memcpy(tensor_data + j * width, dense_data_row.data(),
+               width * sizeof(float));
+      }
+
+      lod_datas.push_back(lod_tensor);
+    }
+
+    // insert tensor for each sparse_slots
+    for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) {
+      std::vector<size_t> lod_data{0};
+      std::vector<int64_t> batch_feasign;
+
+      for (size_t row_idx = 0; row_idx < batch_sparse_data.size(); ++row_idx) {
+        auto& sparse_ids = batch_sparse_data[row_idx][i];
+        lod_data.push_back(lod_data.back() + sparse_ids.size());
+        batch_feasign.insert(batch_feasign.end(), sparse_ids.begin(),
+                             sparse_ids.end());
+      }
+
+      framework::LoDTensor lod_tensor;
+      framework::LoD lod{lod_data};
+      lod_tensor.set_lod(lod);
+      int64_t* tensor_data = lod_tensor.mutable_data<int64_t>(
+          framework::make_ddim({static_cast<int64_t>(batch_feasign.size()), 1}),
+          platform::CPUPlace());
+      memcpy(tensor_data, batch_feasign.data(),
+             batch_feasign.size() * sizeof(int64_t));
+      lod_datas.push_back(lod_tensor);
+    }
+
+    queue->Push(lod_datas);
+    VLOG(4) << "push one data, queue_size=" << queue->Size();
+  }
+}
+
 void ReadThread(const std::vector<std::string>& file_list,
                 const DataDesc& data_desc, int thread_id,
                 std::vector<ReaderThreadStatus>* thread_status,
@@ -386,6 +386,8 @@ void ReadThread(const std::vector<std::string>& file_list,
   if (data_desc.file_format_ == "svm") {
     ReadSvmData(data_desc, reader, queue);
+  } else if (data_desc.file_format_ == "csv") {
+    ReadCsvData(data_desc, reader, queue);
   }

   (*thread_status)[thread_id] = Stopped;

diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc
index dfdaae3a04..9f3a254c84 100644
--- a/paddle/fluid/operators/reader/ctr_reader_test.cc
+++ b/paddle/fluid/operators/reader/ctr_reader_test.cc
@@ -159,3 +159,71 @@ TEST(CTR_READER, read_data) {
                  &reader);
   reader.Shutdown();
 }
+
+static void GenereteCsvData(const std::string& file_name,
+                            const std::vector<std::string>& data) {
+  std::ofstream out(file_name.c_str());
+  PADDLE_ENFORCE(out.good(), "open file %s failed!", file_name);
+  for (auto& c : data) {
+    out << c;
+  }
+  out.close();
+  PADDLE_ENFORCE(out.good(), "save file %s failed!", file_name);
+}
+
+static void CheckReadCsvOut(const std::vector<LoDTensor>& out) {
+  ASSERT_EQ(out.size(), 3);
+  ASSERT_EQ(out[0].dims()[1], 1);
+  ASSERT_EQ(out[1].dims()[1], 2);
+  ASSERT_EQ(out[2].dims()[1], 1);
+  for (size_t i = 0; i < out[0].numel(); ++i) {
+    int64_t label = out[0].data<int64_t>()[i];
+    auto& dense_dim = out[1].dims();
+    for (size_t j = 0; j < dense_dim[1]; ++j) {
+      ASSERT_EQ(out[1].data<float>()[i * dense_dim[1] + j],
+                static_cast<float>(label + 0.1));
+    }
+    auto& sparse_lod = out[2].lod();
+    for (size_t j = sparse_lod[0][i]; j < sparse_lod[0][i + 1]; ++j) {
+      ASSERT_EQ(out[2].data<int64_t>()[j], label);
+    }
+  }
+}
+
+TEST(CTR_READER, read_csv_data) {
+  std::string file_name = "test_ctr_reader_data.csv";
+  const std::vector<std::string> csv_data = {
+      "0 0.1,0.1 0,0,0,0\n", "1 1.1,1.1 1,1,1,1\n", "2 2.1,2.1 2,2,2,2\n",
+      "3 3.1,3.1 3,3,3,3\n",
+  };
+  GenereteCsvData(file_name, csv_data);
+
+  LoDTensorBlockingQueueHolder queue_holder;
+  int capacity = 64;
+  queue_holder.InitOnce(capacity, false);
+
+  std::shared_ptr<LoDTensorBlockingQueue> queue = queue_holder.GetQueue();
+
+  int batch_size = 3;
+  int thread_num = 1;
+  std::vector<std::string> file_list;
+  for (int i = 0; i < thread_num; ++i) {
+    file_list.push_back(file_name);
+  }
+  DataDesc data_desc(batch_size, file_list, "plain", "csv", {1}, {2}, {});
+
+  CTRReader reader(queue, thread_num, data_desc);
+
+  for (size_t i = 0; i < 2; ++i) {
+    reader.Start();
+    std::vector<LoDTensor> out;
+    while (true) {
+      reader.ReadNext(&out);
+      if (out.empty()) {
+        break;
+      }
+      CheckReadCsvOut(out);
+    }
+    reader.Shutdown();
+  }
+}
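The csv branch first splits a line on spaces into label, dense and sparse columns, then splits each selected column on commas. Tracing one line of the test fixture through parse_csv_line (a walk-through of the code above, with dense_slot_index = {1} and sparse_slot_index = {2}):

    std::string line = "1 1.1,1.1 1,1,1,1";  // from the csv test data
    // label        -> 1                  (column 0)
    // dense_datas  -> {{1.1f, 1.1f}}     (column 1, comma-separated)
    // sparse_datas -> {{1, 1, 1, 1}}     (column 2, comma-separated)
    // ReadCsvData then pushes three LoDTensors per batch: the [N, 1] label,
    // a [N, 2] dense tensor, and one LoD tensor of sparse feasigns.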
From daba57f752b72be55fc5cdad86de2d5f52bb261c Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Tue, 4 Dec 2018 16:50:59 +0800
Subject: [PATCH 008/123] complete ctr_reader

---
 .../operators/reader/create_ctr_reader_op.cc  |  5 ++-
 paddle/fluid/operators/reader/ctr_reader.cc   |  6 +--
 paddle/fluid/operators/reader/ctr_reader.h    | 36 +++++++++++++++-
 paddle/fluid/operators/reader/read_op.cc      | 41 +++++++++++-------
 .../operators/reader/reader_op_registry.cc    | 34 +++++++++------
 paddle/fluid/pybind/pybind.cc                 |  1 +
 python/paddle/fluid/contrib/__init__.py       |  3 ++
 .../paddle/fluid/contrib/reader/__init__.py   | 19 +++++++++
 .../paddle/fluid/contrib/reader/ctr_reader.py | 42 +++++++++++++++----
 python/setup.py.in                            |  1 +
 10 files changed, 143 insertions(+), 45 deletions(-)
 create mode 100644 python/paddle/fluid/contrib/reader/__init__.py

diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc
index 5b9e2ba693..2a3e80c915 100644
--- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc
@@ -51,6 +51,7 @@ class CreateCTRReaderOp : public framework::OperatorBase {
     auto file_list = Attr<std::vector<std::string>>("file_list");
     DataDesc data_desc(batch_size, file_list, file_type, file_format,
                        dense_slot_index, sparse_slot_index, sparse_slots);
+    VLOG(1) << data_desc;
     out->Reset(std::make_shared<CTRReader>(queue_holder->GetQueue(), thread_num,
                                            data_desc));
   }
 };

@@ -69,10 +70,10 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase {
     AddAttr<std::vector<int>>(
         "dense_slot_index",
-        "the sparse slots id that should be extract from file")
+        "the dense slots id that should be extract from file")
         .SetDefault({});
     AddAttr<std::vector<int>>(
-        "dense_slot_index",
+        "sparse_slot_index",
         "the sparse slots id that should be extract from file")
         .SetDefault({});
     AddAttr<std::vector<std::string>>("sparse_slots",
                                       "the sparse slots id that should be "
                                       "extract from file, used when file "
                                       "format is svm");

     AddComment(R"DOC(
       Create CTRReader to support read ctr data with cpp.

diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc
index 3595d771b4..946f17750e 100644
--- a/paddle/fluid/operators/reader/ctr_reader.cc
+++ b/paddle/fluid/operators/reader/ctr_reader.cc
@@ -157,8 +157,8 @@ void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
     }
     std::this_thread::sleep_for(std::chrono::milliseconds(1000));
   }
-  VLOG(3) << "all reader thread is stopped, push empty data into queue";
-  queue->Push({});
+  VLOG(3) << "all reader thread is stopped, close the queue";
+  queue->Close();
   VLOG(3) << "monitor thread exited";
 }

@@ -247,7 +247,7 @@ static inline void parse_csv_line(
     int slot_idx = data_desc.dense_slot_index_[i];
     auto& slot_data = ret[slot_idx];
     std::vector<std::string> data_in_slot_str;
-    string_split(ret[slot_idx], ',', &data_in_slot_str);
+    string_split(slot_data, ',', &data_in_slot_str);
     std::vector<float> data_in_slot;

diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h
index 1f4663e3b8..eef6c11aaa 100644
--- a/paddle/fluid/operators/reader/ctr_reader.h
+++ b/paddle/fluid/operators/reader/ctr_reader.h
@@ -60,6 +60,35 @@ struct DataDesc {
   const std::vector<std::string> sparse_slot_ids_;
 };

+inline std::ostream& operator<<(std::ostream& os, const DataDesc& data_desc) {
+  os << "data_desc:\n";
+  os << "\tbatch_size -> " << data_desc.batch_size_ << "\n";
+  os << "\tfile_type -> " << data_desc.file_type_ << "\n";
+  os << "\tfile_format -> " << data_desc.file_format_ << "\n";
+  os << "\tfile_names -> {";
+  for (auto& file_name : data_desc.file_names_) {
+    os << file_name << ",";
+  }
+  os << "}\n";
+  os << "\tdense_slot_index -> {";
+  for (auto& slot : data_desc.dense_slot_index_) {
+    os << slot << ",";
+  }
+  os << "}\n";
+  os << "\tsparse_slot_index_ -> {";
+  for (auto& slot : data_desc.sparse_slot_index_) {
+    os << slot << ",";
+  }
+  os << "}\n";
+  os << "\tsparse_slot_ids_ -> {";
+  for (auto& slot : data_desc.sparse_slot_ids_) {
+    os << slot << ",";
+  }
+  os << "}\n";
+
+  return os;
+}
+
 void ReadThread(const std::vector<std::string>& file_list,
                 const DataDesc& data_desc, int thread_id,
                 std::vector<ReaderThreadStatus>* thread_status,
@@ -89,7 +118,7 @@ class CTRReader : public framework::FileReader {
     }
   }

-  ~CTRReader() {}
+  ~CTRReader() { Shutdown(); }

   void ReadNext(std::vector<framework::LoDTensor>* out) override {
     bool success;
@@ -106,7 +135,10 @@ class CTRReader : public framework::FileReader {
     for (auto& read_thread : read_threads_) {
       read_thread->join();
     }
-    monitor_thread_->join();
+
+    if (monitor_thread_) {
+      monitor_thread_->join();
+    }

     read_threads_.clear();
     monitor_thread_.reset(nullptr);

diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc
index a0b70938d3..97faade042 100644
--- a/paddle/fluid/operators/reader/read_op.cc
+++ b/paddle/fluid/operators/reader/read_op.cc
@@ -27,15 +27,16 @@ class ReadInferShape : public framework::InferShapeBase {
                    "The ReadOp must take a reader as input.");
     PADDLE_ENFORCE(ctx->HasOutputs("Out"),
                    "The ReadOp should be assigned with output.");
-    std::vector<framework::DDim> reader_dims = ctx->GetReaderDims("Reader");
-    std::vector<std::string> out_names = ctx->Outputs("Out");
-    PADDLE_ENFORCE_EQ(
-        reader_dims.size(), out_names.size(),
-        "The reader's dim number doesn't match the output number.");
-    ctx->SetOutputsDim("Out", reader_dims);
-    if (!ctx->IsRuntime()) {
+    if (!ctx->IsRuntime() && ctx->Attrs().Get<bool>("infer_out")) {
+      std::vector<framework::DDim> reader_dims = ctx->GetReaderDims("Reader");
+      std::vector<std::string> out_names = ctx->Outputs("Out");
+      PADDLE_ENFORCE_EQ(
+          reader_dims.size(), out_names.size(),
+          "The reader's dim number doesn't match the output number.");
+      ctx->SetOutputsDim("Out", reader_dims);
       auto in_desc =
           boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Reader")[0]);
+      std::cout << in_desc->Proto()->SerializeAsString() << std::endl;
       auto in_lod_levels = in_desc->GetLoDLevels();
       auto out_var_ptrs = ctx->GetOutputVarPtrs("Out");
       PADDLE_ENFORCE_EQ(in_lod_levels.size(), out_var_ptrs.size(),
@@ -53,15 +54,18 @@ class ReadInferVarType : public framework::VarTypeInference {
  public:
   void operator()(const framework::OpDesc& op_desc,
                   framework::BlockDesc* block) const override {
-    std::string reader_name = op_desc.Input("Reader")[0];
-    std::vector<std::string> out_names = op_desc.Output("Out");
-    framework::VarDesc* reader = block->FindVarRecursive(reader_name);
-    auto dtypes = reader->GetDataTypes();
-    PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
-    for (size_t i = 0; i < dtypes.size(); ++i) {
-      framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
-      out.SetType(framework::proto::VarType::LOD_TENSOR);
-      out.SetDataType(dtypes[i]);
+    bool infer_out = boost::get<bool>(op_desc.GetAttr("infer_out"));
+    if (infer_out) {
+      std::string reader_name = op_desc.Input("Reader")[0];
+      std::vector<std::string> out_names = op_desc.Output("Out");
+      framework::VarDesc* reader = block->FindVarRecursive(reader_name);
+      auto dtypes = reader->GetDataTypes();
+      PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
+      for (size_t i = 0; i < dtypes.size(); ++i) {
+        framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
+        out.SetType(framework::proto::VarType::LOD_TENSOR);
+        out.SetDataType(dtypes[i]);
+      }
     }
   }
 };
@@ -73,6 +77,7 @@ class ReadOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override {
+    VLOG(3) << "read op in";
     framework::ReaderHolder* reader =
         detail::Ref(scope.FindVar(Input("Reader")),
                     "Cannot find reader variable %s", Input("Reader"))
@@ -87,7 +92,9 @@ class ReadOp : public framework::OperatorBase {
     reader->ReadNext(&ins);

     if (ins.empty()) {
+      VLOG(3) << "read empty data in";
       if (Attr<bool>("throw_eof_exp")) {
+        VLOG(3) << "throw_eof_exp";
         PADDLE_THROW_EOF();
       } else {
         ins.resize(out_arg_names.size());
         for (auto& tensor : ins) {
           tensor.mutable_data<float>(framework::make_ddim({0}), dev_place);
         }
       }
+      VLOG(3) << "read empty data out";
     }
     PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
     for (size_t i = 0; i < out_arg_names.size(); ++i) {
@@ -120,6 +128,7 @@ class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
              " only when the data-balance is enabled in ParallelExecutor"
              " and it is set by ParallelExecutor instance, not users.")
         .SetDefault(true);
+    AddAttr<bool>("infer_out", "").SetDefault(true);
     AddComment(R"DOC(
       Read Operator

diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc
index b82aab1214..3921eedf94 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -65,6 +65,10 @@ void FileReaderMakerBase::Make() {
       "It means the reader will generate two data each time,"
       "whose shapes are [2,3,4] and [5,6] respectively.");
   AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
+  AddAttr<bool>(
+      "use_data_config",
+      "Use the config of all datas like shape_concat/ranks/lod_levels")
+      .SetDefault(true);
   Apply();
 }

@@ -75,19 +79,23 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE(ctx->HasOutput("Out"),
                  "The output file reader should not be null.");
-  const auto shape_concat = ctx->Attrs().Get<std::vector<int>>("shape_concat");
-  const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
-  std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
-  ctx->SetReaderDims("Out", shapes);
-
-  const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
-  PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
-                    "The number of 'lod_levels'(%d) doesn't match the number "
-                    "of 'shapes'(%d).",
-                    lod_levels.size(), shapes.size());
-  framework::VarDesc* reader =
-      boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
-  reader->SetLoDLevels(lod_levels);
+  bool use_data_config = ctx->Attrs().Get<bool>("use_data_config");
+  if (use_data_config) {
+    const auto shape_concat =
+        ctx->Attrs().Get<std::vector<int>>("shape_concat");
+    const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
+    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
+    ctx->SetReaderDims("Out", shapes);
+
+    const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
+    PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
+                      "The number of 'lod_levels'(%d) doesn't match the number "
+                      "of 'shapes'(%d).",
+                      lod_levels.size(), shapes.size());
+    framework::VarDesc* reader =
+        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
+    reader->SetLoDLevels(lod_levels);
+  }
 }

 void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index f0a5d1afc9..681b213b46 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -364,6 +364,7 @@ All parameter, weight, gradient are variables in Paddle.
         py::return_value_policy::reference);

   py::class_<framework::ReaderHolder>(m, "Reader", "")
+      .def("start", &framework::ReaderHolder::Start)
       .def("reset", &framework::ReaderHolder::ResetAll);

   using LoDTensorBlockingQueue =

diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py
index 3bf2fe5db0..5d4b157727 100644
--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -22,9 +22,12 @@ from . import op_frequence
 from .op_frequence import *
 from . import quantize
 from .quantize import *
+from . import reader
+from .reader import *

 __all__ = []
 __all__ += decoder.__all__
 __all__ += memory_usage_calc.__all__
 __all__ += op_frequence.__all__
 __all__ += quantize.__all__
+__all__ += reader.__all__

diff --git a/python/paddle/fluid/contrib/reader/__init__.py b/python/paddle/fluid/contrib/reader/__init__.py
new file mode 100644
index 0000000000..4cf85ffc16
--- /dev/null
+++ b/python/paddle/fluid/contrib/reader/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from . import ctr_reader
+
+__all__ = ctr_reader.__all__

diff --git a/python/paddle/fluid/contrib/reader/ctr_reader.py b/python/paddle/fluid/contrib/reader/ctr_reader.py
index d7133562de..aad8ded87d 100644
--- a/python/paddle/fluid/contrib/reader/ctr_reader.py
+++ b/python/paddle/fluid/contrib/reader/ctr_reader.py
@@ -20,6 +20,8 @@ from paddle.fluid.framework import default_main_program, \
     default_startup_program, Variable
 from paddle.fluid.unique_name import generate as unique_name

+__all__ = ['ctr_reader']
+

 def monkey_patch_reader_methods(reader):
     def __get_reader__():
@@ -30,7 +32,11 @@ def monkey_patch_reader_methods(reader):
     def reset():
         return __get_reader__().reset()

+    def start():
+        return __get_reader__().start()
+
     reader.reset = reset
+    reader.start = start
     reader.stop_gradient = True
     reader.persistable = True
     return reader
@@ -44,13 +50,18 @@ def _copy_reader_var_(block, var):
     return new_var


-def ctr_reader(feed_data,
-               capacity,
-               thread_num,
-               batch_size,
-               file_list,
-               slots,
-               name=None):
+def ctr_reader(
+        feed_dict,
+        file_type,  # gzip or plain
+        file_format,  # csv or svm
+        dense_slot_indexs,
+        sparse_slot_indexs,
+        capacity,
+        thread_num,
+        batch_size,
+        file_list,
+        slots,
+        name=None):
     """
     Create a CTR reader for data feeding in Python
@@ -99,12 +110,22 @@ def ctr_reader(
         inputs={'blocking_queue': [queue_name]},
         outputs={'Out': [reader_var]},
         attrs={
+            'use_data_config': False,
             'thread_num': thread_num,
             'batch_size': batch_size,
             'file_list': file_list,
-            'slots': slots,
+            'file_type': file_type,
+            'file_format': file_format,
+            'dense_slot_index': dense_slot_indexs,
+            'sparse_slot_index': sparse_slot_indexs,
+            'sparse_slots': slots,
+            'ranks': [],
+            'lod_levels': [],
+            'shape_concat': []
         })

+    dtypes = [data.dtype for data in feed_dict]
+    reader_var.desc.set_dtypes(dtypes)
     reader_var.persistable = True

     main_prog_reader_var = _copy_reader_var_(
@@ -118,6 +139,9 @@ def ctr_reader(
     main_blk = default_main_program().current_block()
     main_blk.append_op(
-        type='read', inputs={'Reader': [reader]}, outputs={'Out': feed_data})
+        type='read',
+        inputs={'Reader': [reader]},
+        attrs={'infer_out': False},
+        outputs={'Out': feed_dict})

     return reader

diff --git a/python/setup.py.in b/python/setup.py.in
index 200b96ec54..d5d82f643e 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -107,6 +107,7 @@ packages=['paddle',
           'paddle.fluid.contrib',
           'paddle.fluid.contrib.decoder',
           'paddle.fluid.contrib.quantize',
+          'paddle.fluid.contrib.reader',
          'paddle.fluid.transpiler',
          'paddle.fluid.transpiler.details']
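With the op, the Python wrapper, and the new `start` binding in place, the read cycle matches what the C++ tests already exercise. A condensed C++ sketch of the full pipeline (capacity, file name and sizes are illustrative):

    LoDTensorBlockingQueueHolder holder;
    holder.InitOnce(/*capacity=*/64, false);

    DataDesc data_desc(/*batch_size=*/3, {"data.csv"}, "plain", "csv",
                       /*dense_slot_index=*/{1}, /*sparse_slot_index=*/{2}, {});
    CTRReader reader(holder.GetQueue(), /*thread_num=*/1, data_desc);

    reader.Start();
    std::vector<framework::LoDTensor> out;
    while (true) {
      reader.ReadNext(&out);
      if (out.empty()) break;  // the monitor thread has closed the queue
    }
    reader.Shutdown();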
From 7b7fe01cae07393430d6f3062497dff19233eeba Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Tue, 4 Dec 2018 16:58:02 +0800
Subject: [PATCH 009/123] optimize code

---
 paddle/fluid/operators/reader/ctr_reader.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc
index 946f17750e..65b300d152 100644
--- a/paddle/fluid/operators/reader/ctr_reader.cc
+++ b/paddle/fluid/operators/reader/ctr_reader.cc
@@ -101,16 +101,16 @@ class GzipReader : public Reader {
 class PlainFileReader : public Reader {
  public:
   explicit PlainFileReader(const std::string& file_name)
-      : myfile_(file_name.c_str()) {}
+      : stream_(file_name.c_str()) {}

   ~PlainFileReader() {}

-  bool HasNext() override { return myfile_.peek() != EOF; }
+  bool HasNext() override { return stream_.peek() != EOF; }

-  void NextLine(std::string* line) override { std::getline(myfile_, *line); }
+  void NextLine(std::string* line) override { std::getline(stream_, *line); }

  private:
-  std::ifstream myfile_;
+  std::ifstream stream_;
 };

From 9af76ade4c93c60faa7a92f0e720721c6f8c1cc5 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Wed, 5 Dec 2018 17:58:42 +0800
Subject: [PATCH 010/123] fix unused var

---
 paddle/fluid/operators/reader/ctr_reader.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc
index 65b300d152..ca9a58615e 100644
--- a/paddle/fluid/operators/reader/ctr_reader.cc
+++ b/paddle/fluid/operators/reader/ctr_reader.cc
@@ -258,7 +258,7 @@ static inline void parse_csv_line(
     int slot_idx = data_desc.sparse_slot_index_[i];
     auto& slot_data = ret[slot_idx];
     std::vector<std::string> data_in_slot_str;
-    string_split(ret[slot_idx], ',', &data_in_slot_str);
+    string_split(slot_data, ',', &data_in_slot_str);
     std::vector<int64_t> data_in_slot;

From 05208e1f2bd7fb77b5427353edc8c3f28ef9a23b Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Thu, 6 Dec 2018 13:00:28 +0800
Subject: [PATCH 011/123] optimize code test=develop

---
 paddle/fluid/operators/reader/ctr_reader.cc | 3 ++-
 paddle/fluid/operators/reader/read_op.cc    | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc
index ca9a58615e..e8edbf6602 100644
--- a/paddle/fluid/operators/reader/ctr_reader.cc
+++ b/paddle/fluid/operators/reader/ctr_reader.cc
@@ -261,7 +261,8 @@ static inline void parse_csv_line(
     std::vector<int64_t> data_in_slot;
     for (auto& data_str : data_in_slot_str) {
-      (*sparse_datas)[i].push_back(std::stol(data_str));
+      auto id = std::stol(data_str);
+      (*sparse_datas)[i].push_back(id);
     }

diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc
index 97faade042..8fe638ac2f 100644
--- a/paddle/fluid/operators/reader/read_op.cc
+++ b/paddle/fluid/operators/reader/read_op.cc
@@ -36,7 +36,6 @@ class ReadInferShape : public framework::InferShapeBase {
       ctx->SetOutputsDim("Out", reader_dims);
       auto in_desc =
           boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Reader")[0]);
-      std::cout << in_desc->Proto()->SerializeAsString() << std::endl;
       auto in_lod_levels = in_desc->GetLoDLevels();

From f60e55c04681e7ed900687a42b5ce95f8ba3a6b5 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Thu, 6 Dec 2018 19:02:45 +0800
Subject: [PATCH 012/123] add ctr_reader to api spec test=develop

---
 paddle/fluid/API.spec | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 26113ee7e9..9d0fad75ba 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -333,6 +333,7 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
 paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_indexs', 'sparse_slot_indexs', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) From 2dd55b873fcad8fb7e06963d6ea08ba17e7ce1b7 Mon Sep 17 00:00:00 2001 From: shippingwang Date: Mon, 17 Dec 2018 13:08:02 +0000 Subject: [PATCH 013/123] Add shuffle_channel_op --- paddle/fluid/operators/shuffle_channel_op.cc | 126 +++++++++++ paddle/fluid/operators/shuffle_channel_op.cu | 24 ++ paddle/fluid/operators/shuffle_channel_op.h | 101 +++++++++ python/paddle/fluid/layers/nn.py | 213 ++++++------------ .../fluid/tests/unittests/test_layers.py | 9 + .../unittests/test_shuffle_channel_op.py | 54 +++++ 6 files changed, 385 insertions(+), 142 deletions(-) create mode 100644 paddle/fluid/operators/shuffle_channel_op.cc create mode 100644 paddle/fluid/operators/shuffle_channel_op.cu create mode 100644 paddle/fluid/operators/shuffle_channel_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc new file mode 100644 index 0000000000..ec1255af16 --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -0,0 +1,126 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/shuffle_channel_op.h" + +namespace paddle { +namespace operators { + +class ShuffleChannelOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx - > HasInput("X"), + "Input(X) of ShuffleChannelOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Out"), + "Output(Out) of ShuffleChannelOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + // ENFORCE group + auto group = ctx->Attrs().Get>("group"); + ctx->SetOutputDim("Out", input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.GetPlace()); + } +}; + +class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), " + "the input feature data of ShuffleChannelOp, the layout is NCHW."); + AddOutput("Out", + "(Tensor, default Tensor), the output of " + "ShuffleChannelOp. The layout is NCHW."); + AddAttr("group", "the number of groups.") + .SetDefault(1) + .AddCustomChecker([](const int& group) { + PADDLE_ENFORCE_GE(group, 1, "group should be larger than 0."); + }); + + AddComment(R"DOC( + Shuffle Channel operator + This operator obtains the group convolutional layer with channels shuffled. + First, divide the input channels in each group into several subgroups, + then, feed each group in the next layer with different subgroups. + + According to the paper, "Suppose a convolution layer with g groups + whose output has g x n channels, first reshape the output channel dimension into(g,n), + transposing and then flattening it back as the input of next layer. " + + Shuffle channel operation makes it possible to build more powerful structures + with multiple group convolutional layers. 
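+
+    As a concrete sketch of this reshape-transpose-flatten (assuming group
+    g = 2 over 6 channels, so n = 3): the channel order [0, 1, 2, 3, 4, 5]
+    is reshaped to [[0, 1, 2], [3, 4, 5]], transposed to
+    [[0, 3], [1, 4], [2, 5]], and flattened back to [0, 3, 1, 4, 2, 5];
+    that is, output channel j * g + i is read from input channel i * n + j.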
+ + please get more information from the following paper: + https://arxiv.org/pdf/1707.01083.pdf + )DOC"); + } +}; + +// Grad + +class ShuffleChannelOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@Grad) should not be null") + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@Grad) should not be null"); + + auto input_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(framework::GradVarName("X"), input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input(framework::GradVarName("Out")) + ->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +// how to write gpu kernal +namespace ops = paddle::operators; +REGISTER_OPERATOR(shufflechannel, ops::ShuffleChannelOp, + ops::ShuffleChannelOpMaker, + paddle::framework::DefaultGradOpDescMaker); +// paddle::framework::EmptyGradOpMaker); + +REGISTER_OPERATOR(shufflechannel_grad, ops::ShuffleChannelGradOp); + +REGISTER_OP_CPU_KERNEL( + shufflechannel, + ops::ShuffleChannelOpKernel, + ops::ShuffleChannelOpKernel); + +REGISTER_OP_CPU_KERNEL( + shufflechannel_grad, + ops::ShuffleChannelGradOpKernel, + ops::ShuffleChannelGradOpKernel); diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu new file mode 100644 index 0000000000..b1eacd0cbe --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/shuffle_channel_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + shufflechannel, + ops::ShuffleChannelOpKernel + ops::ShuffleChannelOpKernel); +REGISTER_OP_CUDA_KERNEL( + shufflechannel_grad, + ops::ShuffleChannelOpGradKernel + ops::ShuffleChannelOpGradKernel); diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h new file mode 100644 index 0000000000..f923babf5b --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -0,0 +1,101 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class ShuffleChannelOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto group = ctx.Input("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + + int group_row = group; + int group_column = channels / group_row; + + const T* input_data = input->data(); + T* output_data = out->mutable_data(ctx.GetPlace()); + + for (int n = 0; n < num; ++n) { + output_data_temp = output_data + n * feature_map_size; + input_data_temp = input_data + n * feature_map_size; + for (int i = 0; i < group_row; ++i) { + for (int j = 0; j < group_column; ++j) { + const auto* p_i = input_data_temp + (i * group_column + j) * sp_sz; + auto* p_o = output_data_temp + (j * group_row + i) * sp_sz; + memcpy(p_o, p_i, sizeof(Dtype) * sp_sz); + } + } + } + return; + } +}; + +template +class ShuffleChannelGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto group = ctx.Input("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + + int group_row = group; + int group_column = channels / group_row; + + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + const T* output_grad_data = output_grad->data(); + + for (int n = 0; n < num; ++n) { + output_grad_temp = output_grad_data + n * feature_map_size; + input_grad_temp = input_grad_data + n * feature_map_size; + for (int i = 0; i < group_row; ++i) { + for (int j = 0; j < group_column; ++j) { + const auto* p_i = output_grad_temp + (i * group_column + j) * sp_sz; + auto* p_o = input_grad_temp + (j * group_row + i) * sp_sz; + memcpy(p_o, p_i, sizeof(Dtype) * sp_sz); + } + } + } + return; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e25eaaa9fd..5e1b6c999b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -31,148 +31,37 @@ from functools import reduce from .. 
import core __all__ = [ - 'fc', - 'embedding', - 'dynamic_lstm', - 'dynamic_lstmp', - 'dynamic_gru', - 'gru_unit', - 'linear_chain_crf', - 'crf_decoding', - 'cos_sim', - 'cross_entropy', - 'bpr_loss', - 'square_error_cost', - 'chunk_eval', - 'sequence_conv', - 'conv2d', - 'conv3d', - 'sequence_pool', - 'sequence_softmax', - 'softmax', - 'pool2d', - 'pool3d', - 'batch_norm', - 'beam_search_decode', - 'conv2d_transpose', - 'conv3d_transpose', - 'sequence_expand', - 'sequence_expand_as', - 'sequence_pad', - 'sequence_unpad', - 'lstm_unit', - 'reduce_sum', - 'reduce_mean', - 'reduce_max', - 'reduce_min', - 'reduce_prod', - 'sequence_first_step', - 'sequence_last_step', - 'sequence_slice', - 'dropout', - 'split', - 'ctc_greedy_decoder', - 'edit_distance', - 'l2_normalize', - 'matmul', - 'topk', - 'warpctc', - 'sequence_reshape', - 'transpose', - 'im2sequence', - 'nce', - 'hsigmoid', - 'beam_search', - 'row_conv', - 'multiplex', - 'layer_norm', - 'group_norm', - 'softmax_with_cross_entropy', - 'smooth_l1', - 'one_hot', - 'autoincreased_step_counter', - 'reshape', - 'squeeze', - 'unsqueeze', - 'lod_reset', - 'lrn', - 'pad', - 'pad_constant_like', - 'label_smooth', - 'roi_pool', - 'roi_align', - 'dice_loss', - 'image_resize', - 'image_resize_short', - 'resize_bilinear', - 'resize_nearest', - 'gather', - 'scatter', - 'sequence_scatter', - 'random_crop', - 'mean_iou', - 'relu', - 'selu', - 'log', - 'crop', - 'rank_loss', - 'margin_rank_loss', - 'elu', - 'relu6', - 'pow', - 'stanh', - 'hard_sigmoid', - 'swish', - 'prelu', - 'brelu', - 'leaky_relu', - 'soft_relu', - 'flatten', - 'sequence_mask', - 'stack', - 'pad2d', - 'unstack', - 'sequence_enumerate', - 'expand', - 'sequence_concat', - 'scale', - 'elementwise_add', - 'elementwise_div', - 'elementwise_sub', - 'elementwise_mul', - 'elementwise_max', - 'elementwise_min', - 'elementwise_pow', - 'uniform_random_batch_size_like', - 'gaussian_random', - 'sampling_id', - 'gaussian_random_batch_size_like', - 'sum', - 'slice', - 'shape', - 'logical_and', - 'logical_or', - 'logical_xor', - 'logical_not', - 'clip', - 'clip_by_norm', - 'mean', - 'mul', - 'sigmoid_cross_entropy_with_logits', - 'maxout', - 'space_to_depth', - 'affine_grid', - 'sequence_reverse', - 'affine_channel', - 'similarity_focus', - 'hash', - 'grid_sampler', - 'log_loss', - 'add_position_encoding', - 'bilinear_tensor_product', - 'merge_selected_rows', - 'get_tensor_from_selected_rows', - 'lstm', + 'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru', + 'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy', + 'bpr_loss', 'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d', + 'conv3d', 'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', + 'pool3d', 'batch_norm', 'beam_search_decode', 'conv2d_transpose', + 'conv3d_transpose', 'sequence_expand', 'sequence_expand_as', 'sequence_pad', + 'sequence_unpad', 'lstm_unit', 'reduce_sum', 'reduce_mean', 'reduce_max', + 'reduce_min', 'reduce_prod', 'sequence_first_step', 'sequence_last_step', + 'sequence_slice', 'dropout', 'split', 'ctc_greedy_decoder', 'edit_distance', + 'l2_normalize', 'matmul', 'topk', 'warpctc', 'sequence_reshape', + 'transpose', 'im2sequence', 'nce', 'hsigmoid', 'beam_search', 'row_conv', + 'multiplex', 'layer_norm', 'group_norm', 'softmax_with_cross_entropy', + 'smooth_l1', 'one_hot', 'autoincreased_step_counter', 'reshape', 'squeeze', + 'unsqueeze', 'lod_reset', 'lrn', 'pad', 'pad_constant_like', 'label_smooth', + 'roi_pool', 'roi_align', 'dice_loss', 'image_resize', 
'image_resize_short', + 'resize_bilinear', 'resize_nearest', 'gather', 'scatter', + 'sequence_scatter', 'random_crop', 'mean_iou', 'relu', 'selu', 'log', + 'crop', 'rank_loss', 'margin_rank_loss', 'elu', 'relu6', 'pow', 'stanh', + 'hard_sigmoid', 'swish', 'prelu', 'brelu', 'leaky_relu', 'soft_relu', + 'flatten', 'sequence_mask', 'stack', 'pad2d', 'unstack', + 'sequence_enumerate', 'expand', 'sequence_concat', 'scale', + 'elementwise_add', 'elementwise_div', 'elementwise_sub', 'elementwise_mul', + 'elementwise_max', 'elementwise_min', 'elementwise_pow', + 'uniform_random_batch_size_like', 'gaussian_random', 'sampling_id', + 'gaussian_random_batch_size_like', 'sum', 'slice', 'shape', 'logical_and', + 'logical_or', 'logical_xor', 'logical_not', 'clip', 'clip_by_norm', 'mean', + 'mul', 'sigmoid_cross_entropy_with_logits', 'maxout', 'space_to_depth', + 'affine_grid', 'sequence_reverse', 'affine_channel', 'similarity_focus', + 'hash', 'grid_sampler', 'log_loss', 'add_position_encoding', + 'bilinear_tensor_product', 'merge_selected_rows', + 'get_tensor_from_selected_rows', 'lstm', 'shufflechannel' ] kIgnoreIndex = -100 @@ -9122,3 +9011,43 @@ def get_tensor_from_selected_rows(x, name=None): outputs={'Out': out}, attrs={}) return out + + +def shuffle_channel(x, group=1, name=None): + """ + **Shuffle Channel Operator** + This operator obtains the group convolutional layer with channels shuffled. + First, divide the input channels in each group into several subgroups, + then, feed each group in the next layer with different subgroups. + Shuffle channel operation makes it possible to build more powerful structures + with multiple group convolutional layers. + + Args: + x: The input tensor variable. + + + Returns: + Variable: channel shuffled tensor variable. + + Raises: + ValueError: If group in not a int type variable. + + Examples: + .. code-block:: python + + + """ + helper = LayerHelper("shuffle_channel", **locals()) + + out = helper.create_variable_for_type_inference( + dtype=helper.intput_dtype('x')) + + if not isinstance(group, int): + raise TypeError("group must be int type") + + helper.append_op( + type="shuffle_channel", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"group": group}) + return out diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 10e8bb5a86..155f59f6fe 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -982,6 +982,15 @@ class TestBook(unittest.TestCase): print(str(program)) + def test_shuffle_channel(self): + program = Program() + with program_guard(program): + x = layers.data(name="x", shape=[10, 32, 16, 16], dtype="float32") + group = layers.data(name="group", shape=[1], dtype="int32") + out = layers.shuffle_channel(x, group) + self.assertIsNotNone(out) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py new file mode 100644 index 0000000000..25df22193c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py @@ -0,0 +1,54 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest +import paddle.fluid.core as core + + +class TestShuffleChannelOp(OpTest): + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'output') + + def setUp(self): + self.op_type = "shuffle_channel" + self.batch_size = 10 + self.input_channels = 16 + self.layer_h = 32 + self.layer_w = 32 + self.group = 4 + + self.x = np.random.random( + (self.batch_size, self.input_channels, self.layer_h, self, + layer_w)).astype('float32') + self.inputs = {'X': self.x} + self.attrs = {'group': self.group} + + n, c, h, w = self.x.shape + input_reshaped = np.reshape(self.x, + (-1, self.group, c // self.group, h, w)) + input_transposed = np.transpose(input_reshaped, (0, 2, 1, 3, 4)) + self.outputs = np.reshape(input_transposed, (-1, c, h, w)) + + +if __name__ == '__main__': + unittest.main() From 16d4e13711fc6df796d458f1bce033614217485d Mon Sep 17 00:00:00 2001 From: shippingwang Date: Mon, 24 Dec 2018 19:33:52 +0000 Subject: [PATCH 014/123] Add ShuffleChannelOP --- paddle/fluid/operators/shuffle_channel_op.cc | 60 +++++----- paddle/fluid/operators/shuffle_channel_op.cu | 112 +++++++++++++++++- paddle/fluid/operators/shuffle_channel_op.h | 36 +++--- python/paddle/fluid/layers/nn.py | 10 +- .../fluid/tests/unittests/test_layers.py | 2 +- .../unittests/test_shuffle_channel_op.py | 30 +++-- 6 files changed, 173 insertions(+), 77 deletions(-) diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index ec1255af16..0ede3922ea 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -19,26 +19,27 @@ class ShuffleChannelOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx - > HasInput("X"), + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of ShuffleChannelOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Out"), + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of ShuffleChannelOp should not be null."); auto input_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); // ENFORCE group - auto group = ctx->Attrs().Get>("group"); + // auto group = ctx->Attrs().Get("group"); ctx->SetOutputDim("Out", input_dims); } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.GetPlace()); - } + /* + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + */ }; class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { @@ -63,7 +64,7 @@ class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker 
{ then, feed each group in the next layer with different subgroups. According to the paper, "Suppose a convolution layer with g groups - whose output has g x n channels, first reshape the output channel dimension into(g,n), + whose output has g * n channels, first reshape the output channel dimension into(g,n), transposing and then flattening it back as the input of next layer. " Shuffle channel operation makes it possible to build more powerful structures @@ -75,52 +76,49 @@ class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { } }; -// Grad - -class ShuffleChannelOpGrad : public framework::OperatorWithKernel { +class ShuffleChannelGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@Grad) should not be null") + "Input(Out@Grad) should not be null"); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Output(X@Grad) should not be null"); auto input_dims = ctx->GetInputDim("X"); ctx->SetOutputDim(framework::GradVarName("X"), input_dims); } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType( - ctx.Input(framework::GradVarName("Out")) - ->type()), - ctx.device_context()); - } + /* + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + */ }; } // namespace operators } // namespace paddle -// how to write gpu kernal namespace ops = paddle::operators; -REGISTER_OPERATOR(shufflechannel, ops::ShuffleChannelOp, +REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp, ops::ShuffleChannelOpMaker, paddle::framework::DefaultGradOpDescMaker); // paddle::framework::EmptyGradOpMaker); -REGISTER_OPERATOR(shufflechannel_grad, ops::ShuffleChannelGradOp); +REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp); REGISTER_OP_CPU_KERNEL( - shufflechannel, + shuffle_channel, ops::ShuffleChannelOpKernel, ops::ShuffleChannelOpKernel); REGISTER_OP_CPU_KERNEL( - shufflechannel_grad, + shuffle_channel_grad, ops::ShuffleChannelGradOpKernel, ops::ShuffleChannelGradOpKernel); diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index b1eacd0cbe..77418ac7e3 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -10,15 +10,115 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/shuffle_channel_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template + +__global__ void ShuffleChannel(const int nthreads, const int feature_map_size, + T* output, const T* input, int group_row, + int group_column, int len) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t ii = index; ii < nthreads; ii += offset) { + const int n = index / group_row / group_column / len; + const int i = (index / group_column / len) % group_row; + const int j = index / len % group_column; + const int k = index - (n * feature_map_size + (i * group_column + j) * len); + T* p_o = output + n * feature_map_size + (j * group_row + i) * len; + p_o[k] = input[index]; + } +} +template +class ShuffleChannelOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + int group_row = group; + int group_column = channel / group_row; + // count is the product of NCHW same as numel() + int count = num * group_column * group_row * sp_sz; + + int blocks = NumBlocks(output->numel()); + int threads = kNumCUDAThreads; + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + ShuffleChannel< + T><<>>( + count, feature_map_size, output_data, input_data, group_row, + group_column, sp_sz); + } +}; + +template +class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + int group = ctx.Attr("group"); + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto weight = input_dims[3]; + auto feature_map_size = channel * height * weight; + auto sp_sz = height * weight; + + int group_row = group; + int group_column = channel / group_row; + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + const T* output_grad_data = output_grad->data(); + + int blocks = NumBlocks(output_grad->numel()); + int threads = kNumCUDAThreads; + int count = num * group_column * group_row * sp_sz; + ShuffleChannel< + T><<>>( + count, feature_map_size, input_grad_data, output_grad_data, group_row, + group_column, sp_sz); + } +}; +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - shufflechannel, - ops::ShuffleChannelOpKernel - ops::ShuffleChannelOpKernel, + ops::ShuffleChannelOpCUDAKernel); REGISTER_OP_CUDA_KERNEL( - shufflechannel_grad, - ops::ShuffleChannelOpGradKernel - ops::ShuffleChannelOpGradKernel, + ops::ShuffleChannelGradOpCUDAKernel); diff --git 
a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h index f923babf5b..5c161c0005 100644 --- a/paddle/fluid/operators/shuffle_channel_op.h +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -21,10 +21,10 @@ namespace operators { template class ShuffleChannelOpKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { + void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); - auto group = ctx.Input("group"); + int group = ctx.Attr("group"); auto input_dims = input->dims(); auto num = input_dims[0]; @@ -34,21 +34,19 @@ class ShuffleChannelOpKernel : public framework::OpKernel { auto feature_map_size = channel * height * weight; auto sp_sz = height * weight; - int group_row = group; - int group_column = channels / group_row; + int group_column = channel / group_row; const T* input_data = input->data(); - T* output_data = out->mutable_data(ctx.GetPlace()); - + T* output_data = output->mutable_data(ctx.GetPlace()); for (int n = 0; n < num; ++n) { - output_data_temp = output_data + n * feature_map_size; - input_data_temp = input_data + n * feature_map_size; for (int i = 0; i < group_row; ++i) { for (int j = 0; j < group_column; ++j) { - const auto* p_i = input_data_temp + (i * group_column + j) * sp_sz; - auto* p_o = output_data_temp + (j * group_row + i) * sp_sz; - memcpy(p_o, p_i, sizeof(Dtype) * sp_sz); + const T* p_i = input_data + n * feature_map_size + + (i * group_column + j) * sp_sz; + T* p_o = + output_data + n * feature_map_size + (j * group_row + i) * sp_sz; + memcpy(p_o, p_i, sizeof(int) * sp_sz); } } } @@ -61,7 +59,7 @@ class ShuffleChannelGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("X"); - auto group = ctx.Input("group"); + int group = ctx.Attr("group"); auto input_dims = input->dims(); auto num = input_dims[0]; @@ -72,7 +70,7 @@ class ShuffleChannelGradOpKernel : public framework::OpKernel { auto sp_sz = height * weight; int group_row = group; - int group_column = channels / group_row; + int group_column = channel / group_row; auto* output_grad = ctx.Input(framework::GradVarName("Out")); @@ -81,19 +79,17 @@ class ShuffleChannelGradOpKernel : public framework::OpKernel { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); const T* output_grad_data = output_grad->data(); - for (int n = 0; n < num; ++n) { - output_grad_temp = output_grad_data + n * feature_map_size; - input_grad_temp = input_grad_data + n * feature_map_size; for (int i = 0; i < group_row; ++i) { for (int j = 0; j < group_column; ++j) { - const auto* p_i = output_grad_temp + (i * group_column + j) * sp_sz; - auto* p_o = input_grad_temp + (j * group_row + i) * sp_sz; - memcpy(p_o, p_i, sizeof(Dtype) * sp_sz); + const T* p_i = output_grad_data + n * feature_map_size + + (i * group_column + j) * sp_sz; + T* p_o = input_grad_data + n * feature_map_size + + (j * group_row + i) * sp_sz; + memcpy(p_o, p_i, sizeof(int) * sp_sz); } } } - return; } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 57d210eab8..fd7cddeffb 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -173,7 +173,7 @@ __all__ = [ 'merge_selected_rows', 'get_tensor_from_selected_rows', 'lstm', - 'shufflechannel', + 'shuffle_channel', 'psroi_pool', ] @@ -9334,17 +9334,20 @@ def 
shuffle_channel(x, group=1, name=None): with multiple group convolutional layers. Args: - x: The input tensor variable. + x: The input tensor variable.. + group: The num of group Returns: Variable: channel shuffled tensor variable. Raises: - ValueError: If group in not a int type variable. + ValueError: If group in not an int type variable. Examples: .. code-block:: python + + out = fluid.layers.shuffle_channel(x=group_conv,group=4) """ @@ -9361,6 +9364,7 @@ def shuffle_channel(x, group=1, name=None): inputs={"X": x}, outputs={"Out": out}, attrs={"group": group}) + return out @templatedoc() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index e2edba030b..7ade135ec3 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -1018,7 +1018,7 @@ class TestBook(unittest.TestCase): def test_shuffle_channel(self): program = Program() with program_guard(program): - x = layers.data(name="x", shape=[10, 32, 16, 16], dtype="float32") + x = layers.data(name="x", shape=[1, 4, 2, 2], dtype="float32") group = layers.data(name="group", shape=[1], dtype="int32") out = layers.shuffle_channel(x, group) self.assertIsNotNone(out) diff --git a/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py index 25df22193c..4fabe424fa 100644 --- a/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py +++ b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py @@ -23,31 +23,29 @@ import paddle.fluid.core as core class TestShuffleChannelOp(OpTest): - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'output') - def setUp(self): self.op_type = "shuffle_channel" - self.batch_size = 10 - self.input_channels = 16 - self.layer_h = 32 - self.layer_w = 32 - self.group = 4 - + self.batch_size = 1 + self.input_channels = 4 + self.layer_h = 2 + self.layer_w = 2 + self.group = 2 self.x = np.random.random( - (self.batch_size, self.input_channels, self.layer_h, self, - layer_w)).astype('float32') + (self.batch_size, self.input_channels, self.layer_h, + self.layer_w)).astype('float32') self.inputs = {'X': self.x} self.attrs = {'group': self.group} - n, c, h, w = self.x.shape input_reshaped = np.reshape(self.x, (-1, self.group, c // self.group, h, w)) input_transposed = np.transpose(input_reshaped, (0, 2, 1, 3, 4)) - self.outputs = np.reshape(input_transposed, (-1, c, h, w)) + self.outputs = {'Out': np.reshape(input_transposed, (-1, c, h, w))} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') if __name__ == '__main__': From 5a9ea9a73d51841790940ffb36790d8424adacba Mon Sep 17 00:00:00 2001 From: shippingwang Date: Tue, 25 Dec 2018 02:25:45 +0000 Subject: [PATCH 015/123] Add ShuffleChannel Op --- paddle/fluid/operators/shuffle_channel_op.cc | 6 +++++- paddle/fluid/operators/shuffle_channel_op.cu | 2 ++ python/paddle/fluid/layers/nn.py | 10 +++++----- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 0ede3922ea..1ab8b42d8d 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -28,7 +28,7 @@ class ShuffleChannelOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); // 
ENFORCE group - // auto group = ctx->Attrs().Get("group"); + ctx->SetOutputDim("Out", input_dims); } /* @@ -87,6 +87,10 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { "Output(X@Grad) should not be null"); auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + // ENFORCE group + ctx->SetOutputDim(framework::GradVarName("X"), input_dims); } /* diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index 77418ac7e3..e8badc40cd 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -81,6 +81,7 @@ class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("X"); int group = ctx.Attr("group"); + auto input_dims = input->dims(); auto num = input_dims[0]; auto channel = input_dims[1]; @@ -101,6 +102,7 @@ class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { int blocks = NumBlocks(output_grad->numel()); int threads = kNumCUDAThreads; int count = num * group_column * group_row * sp_sz; + ShuffleChannel< T><<>>( count, feature_map_size, input_grad_data, output_grad_data, group_row, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3e3eea084e..e654047df6 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -55,6 +55,8 @@ __all__ = [ 'softmax', 'pool2d', 'pool3d', + 'adaptive_pool2d', + 'adaptive_pool3d', 'batch_norm', 'beam_search_decode', 'conv2d_transpose', @@ -9342,24 +9344,22 @@ def shuffle_channel(x, group=1, name=None): x: The input tensor variable.. group: The num of group - Returns: - Variable: channel shuffled tensor variable. + Variable: channels shuffled tensor variable. Raises: - ValueError: If group in not an int type variable. + ValueError: If group is not an int type variable. Examples: .. 
code-block:: python out = fluid.layers.shuffle_channel(x=group_conv,group=4) - """ helper = LayerHelper("shuffle_channel", **locals()) out = helper.create_variable_for_type_inference( - dtype=helper.intput_dtype('x')) + dtype=helper.input_dtype('x')) if not isinstance(group, int): raise TypeError("group must be int type") From 42909dae8785223a2d9464a246aebd08f94532da Mon Sep 17 00:00:00 2001 From: shippingwang Date: Tue, 25 Dec 2018 05:46:54 +0000 Subject: [PATCH 016/123] add API.spec, test=develop --- paddle/fluid/API.spec | 109 ++++++------------------------------------ 1 file changed, 14 insertions(+), 95 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index b6974c6af2..b6b7af9510 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -26,27 +26,12 @@ paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], vara paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) -paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None -paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None -paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None -paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None +paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None +paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ReduceStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.BuildStrategy) -> None paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, '')) -paddle.fluid.AsyncExecutor.config_distributed_nodes ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.download_data 
ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)) -paddle.fluid.AsyncExecutor.get_instance ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.init_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.init_server ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)) -paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) @@ -74,7 +59,6 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) -paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)) @@ -85,9 +69,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) -paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, 
None)) -paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) -paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) +paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) @@ -111,18 +93,17 @@ paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)) paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)) paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)) +paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False)) paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) -paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)) -paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)) +paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 
'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None)) +paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) -paddle.fluid.layers.group_norm ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)) -paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False)) +paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False)) paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)) @@ -137,17 +118,15 @@ paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)) paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)) paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)) -paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None)) +paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR')) paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)) -paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 
'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.selu ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -193,24 +172,15 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)) -paddle.fluid.layers.similarity_focus ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)) paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.layers.merge_selected_rows 
ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) -paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -219,7 +189,6 @@ paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, k paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)) -paddle.fluid.layers.create_py_reader_by_data ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)) paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) @@ -229,7 +198,6 @@ paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'], paddle.fluid.layers.create_parameter ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)) paddle.fluid.layers.create_global_var ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)) paddle.fluid.layers.cast ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.tensor_array_to_tensor ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)) paddle.fluid.layers.concat ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.sums ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.assign ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,)) @@ 
-300,7 +268,6 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)) -paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)) paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)) paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) @@ -315,7 +282,6 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) @@ -351,39 +317,6 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) 
paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) -paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) -paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.CompressPass.apply ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.ImitationGraph.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.ImitationGraph.all_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.SensitivePruneStrategy.__init__ ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None)) -paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.MagnitudePruner.__init__ ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.MagnitudePruner.prune ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.RatioPruner.__init__ ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.RatioPruner.prune ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.HDFSClient.__init__ ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.HDFSClient.delete ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.HDFSClient.download ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, 
keywords=None, defaults=(False, False)) -paddle.fluid.contrib.HDFSClient.is_dir ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.HDFSClient.is_exist ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.HDFSClient.ls ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.HDFSClient.lsr ArgSpec(args=['self', 'hdfs_path', 'only_file', 'sort'], varargs=None, keywords=None, defaults=(True, True)) -paddle.fluid.contrib.HDFSClient.make_local_dirs ArgSpec(args=['local_path'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.HDFSClient.makedirs ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.HDFSClient.rename ArgSpec(args=['self', 'hdfs_src_path', 'hdfs_dst_path', 'overwrite'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.contrib.HDFSClient.upload ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'retry_times'], varargs=None, keywords=None, defaults=(False, 5)) -paddle.fluid.contrib.multi_download ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)) -paddle.fluid.contrib.multi_upload ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) @@ -400,7 +333,7 @@ paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], va paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)) -paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)) +paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max')) paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)) @@ -410,7 +343,7 
@@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None)) paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)) +paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)) paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) @@ -469,17 +402,3 @@ paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable -paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) -paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) -paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) -paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None) -paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None) -paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None) -paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)) -paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')) -paddle.reader.PipeReader.get_line ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')) -paddle.reader.multiprocess_reader ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)) -paddle.reader.Fake.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.reader.creator.np_array ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) -paddle.reader.creator.text_file ArgSpec(args=['path'], varargs=None, keywords=None, 
defaults=None) -paddle.reader.creator.recordio ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)) From c2575ac72de5095dd6259ed0c3e6162b0ec7b2e6 Mon Sep 17 00:00:00 2001 From: shippingwang Date: Tue, 25 Dec 2018 08:20:26 +0000 Subject: [PATCH 017/123] update API.spec, test=develop --- paddle/fluid/API.spec | 110 ++++++++++++++++--- paddle/fluid/operators/shuffle_channel_op.cc | 11 +- paddle/fluid/operators/shuffle_channel_op.cu | 3 +- paddle/fluid/operators/shuffle_channel_op.h | 3 +- 4 files changed, 101 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index b6b7af9510..b1e8a986fb 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -26,12 +26,27 @@ paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], vara paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) -paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None -paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None -paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ReduceStrategy, arg0: int) -> None -paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.BuildStrategy) -> None +paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None +paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, '')) +paddle.fluid.AsyncExecutor.config_distributed_nodes ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.download_data ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 
'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)) +paddle.fluid.AsyncExecutor.get_instance ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.init_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.init_server ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)) +paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) @@ -59,6 +74,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) +paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)) @@ -69,7 +85,9 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) -paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 
'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False)) +paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) +paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) +paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) @@ -93,17 +111,18 @@ paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)) paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)) paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False)) +paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)) paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) -paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None)) -paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, 
None, None, None, 'uniform', None, 0, False)) +paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)) paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) -paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False)) +paddle.fluid.layers.group_norm ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)) +paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False)) paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)) @@ -118,15 +137,17 @@ paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)) paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)) paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)) -paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR')) +paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None)) paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)) -paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, 
defaults=(None, None, None, None)) paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.selu ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -172,15 +193,25 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)) +paddle.fluid.layers.similarity_focus ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)) paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, 
keywords=None, defaults=(None,)) +paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) +paddle.fluid.layers.shuffle_channel ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(1, None)) +paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -189,6 +220,7 @@ paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, k paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)) +paddle.fluid.layers.create_py_reader_by_data ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)) paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) @@ -198,6 +230,7 @@ paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'], paddle.fluid.layers.create_parameter ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)) paddle.fluid.layers.create_global_var ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)) paddle.fluid.layers.cast ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.tensor_array_to_tensor ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)) paddle.fluid.layers.concat ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.sums ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.assign 
ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,)) @@ -268,6 +301,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)) +paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)) paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)) paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) @@ -282,6 +316,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) @@ -317,6 +352,39 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, 
keywords=None, defaults=(False, None)) paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) +paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) +paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.CompressPass.apply ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.ImitationGraph.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.ImitationGraph.all_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.__init__ ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None)) +paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.MagnitudePruner.__init__ ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.MagnitudePruner.prune ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.RatioPruner.__init__ ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.RatioPruner.prune ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.HDFSClient.__init__ ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.HDFSClient.delete ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.HDFSClient.download ArgSpec(args=['self', 'hdfs_path', 'local_path', 
'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)) +paddle.fluid.contrib.HDFSClient.is_dir ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.HDFSClient.is_exist ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.HDFSClient.ls ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.HDFSClient.lsr ArgSpec(args=['self', 'hdfs_path', 'only_file', 'sort'], varargs=None, keywords=None, defaults=(True, True)) +paddle.fluid.contrib.HDFSClient.make_local_dirs ArgSpec(args=['local_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.HDFSClient.makedirs ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.HDFSClient.rename ArgSpec(args=['self', 'hdfs_src_path', 'hdfs_dst_path', 'overwrite'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.contrib.HDFSClient.upload ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'retry_times'], varargs=None, keywords=None, defaults=(False, 5)) +paddle.fluid.contrib.multi_download ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)) +paddle.fluid.contrib.multi_upload ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) @@ -333,7 +401,7 @@ paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], va paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)) -paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max')) +paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)) paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 
0.0, 1, 'max', True)) @@ -343,7 +411,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None)) paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)) +paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)) paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) @@ -402,3 +470,17 @@ paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable +paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) +paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) +paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) +paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None) +paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None) +paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None) +paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)) +paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')) +paddle.reader.PipeReader.get_line ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')) +paddle.reader.multiprocess_reader ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)) +paddle.reader.Fake.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.reader.creator.np_array ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) +paddle.reader.creator.text_file 
ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None) +paddle.reader.creator.recordio ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)) diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 1ab8b42d8d..8449efe4a9 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -27,8 +27,6 @@ class ShuffleChannelOp : public framework::OperatorWithKernel { auto input_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); - // ENFORCE group - ctx->SetOutputDim("Out", input_dims); } /* @@ -60,11 +58,11 @@ class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Shuffle Channel operator This operator obtains the group convolutional layer with channels shuffled. - First, divide the input channels in each group into several subgroups, + Firstly, divide the input channels in each group into several subgroups, then, feed each group in the next layer with different subgroups. - According to the paper, "Suppose a convolution layer with g groups - whose output has g * n channels, first reshape the output channel dimension into(g,n), + According to the paper, "Suppose a convolution layer with G groups + whose output has (G * N) channels, first reshape the output channel dimension into(G,N), transposing and then flattening it back as the input of next layer. " Shuffle channel operation makes it possible to build more powerful structures @@ -89,8 +87,6 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { auto input_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); - // ENFORCE group - ctx->SetOutputDim(framework::GradVarName("X"), input_dims); } /* @@ -112,7 +108,6 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp, ops::ShuffleChannelOpMaker, paddle::framework::DefaultGradOpDescMaker); -// paddle::framework::EmptyGradOpMaker); REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp); diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index e8badc40cd..9506343b3d 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -26,7 +26,6 @@ static inline int NumBlocks(const int N) { } template - __global__ void ShuffleChannel(const int nthreads, const int feature_map_size, T* output, const T* input, int group_row, int group_column, int len) { diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h index 5c161c0005..dbb4495e33 100644 --- a/paddle/fluid/operators/shuffle_channel_op.h +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at @@ -76,7 +76,6 @@ class ShuffleChannelGradOpKernel : public framework::OpKernel { ctx.Input(framework::GradVarName("Out")); auto* input_grad = ctx.Output(framework::GradVarName("X")); - T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); const T* output_grad_data = output_grad->data(); for (int n = 0; n < num; ++n) { From 942d7cf7ef0e817f3f77cdfbbfb684435554f407 Mon Sep 17 00:00:00 2001 From: shippingwang Date: Tue, 25 Dec 2018 11:46:19 +0000 Subject: [PATCH 018/123] Modify python interface, test=develop --- python/paddle/fluid/layers/nn.py | 2 +- python/paddle/fluid/tests/unittests/test_layers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e654047df6..7f7086641a 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9359,7 +9359,7 @@ def shuffle_channel(x, group=1, name=None): helper = LayerHelper("shuffle_channel", **locals()) out = helper.create_variable_for_type_inference( - dtype=helper.input_dtype('x')) + dtype=helper.input_dtype('X')) if not isinstance(group, int): raise TypeError("group must be int type") diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index bc4005122e..daf4a9c824 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -1018,7 +1018,7 @@ class TestBook(unittest.TestCase): def test_shuffle_channel(self): program = Program() with program_guard(program): - x = layers.data(name="x", shape=[10, 16, 4, 4], dtype="float32") + x = layers.data(name="X", shape=[10, 16, 4, 4], dtype="float32") out = layers.shuffle_channel(x, group=2) self.assertIsNotNone(out) print(str(program)) From 3757c1ee479ddb67b622db03b41d3300f3fab62d Mon Sep 17 00:00:00 2001 From: shippingwang Date: Tue, 25 Dec 2018 17:10:54 +0000 Subject: [PATCH 019/123] Modify test layers, test=develop --- python/paddle/fluid/layers/nn.py | 7 +++---- python/paddle/fluid/tests/unittests/test_layers.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 7f7086641a..63ff5f07e7 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9341,8 +9341,8 @@ def shuffle_channel(x, group=1, name=None): with multiple group convolutional layers. Args: - x: The input tensor variable.. - group: The num of group + x(Variable): The input tensor variable. + group(Integer): The number of groups. Returns: Variable: channels shuffled tensor variable.
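For reference, the reshape/transpose/flatten trick that this docstring and the operator comment above describe in words can be written as a short NumPy sketch. This sketch is an editorial illustration, not part of the patch; the helper name and the example shapes are assumptions.

import numpy as np

def shuffle_channel_ref(x, group):
    # x is an NCHW array; the channel count must be divisible by `group`.
    n, c, h, w = x.shape
    assert c % group == 0
    # Reshape the channel axis into (group, c // group), swap those two
    # axes, then flatten back -- the reshape/transpose/flatten step quoted
    # from the ShuffleNet paper above.
    return (x.reshape(n, group, c // group, h, w)
             .transpose(0, 2, 1, 3, 4)
             .reshape(n, c, h, w))

# With 16 channels and group=4 the output channel order becomes
# 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15.
x = np.arange(16 * 4 * 4, dtype=np.float32).reshape(1, 16, 4, 4)
out = shuffle_channel_ref(x, group=4)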
@@ -9358,8 +9358,7 @@ def shuffle_channel(x, group=1, name=None): """ helper = LayerHelper("shuffle_channel", **locals()) - out = helper.create_variable_for_type_inference( - dtype=helper.input_dtype('X')) + out = helper.create_variable_for_type_inference(dtype=x.dtype) if not isinstance(group, int): raise TypeError("group must be int type") diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index daf4a9c824..9e392fa8e0 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -1018,8 +1018,8 @@ class TestBook(unittest.TestCase): def test_shuffle_channel(self): program = Program() with program_guard(program): - x = layers.data(name="X", shape=[10, 16, 4, 4], dtype="float32") - out = layers.shuffle_channel(x, group=2) + x = layers.data(name="X", shape=[16, 4, 4], dtype="float32") + out = layers.shuffle_channel(x, group=4) self.assertIsNotNone(out) print(str(program)) From b53eb7dcda6cb2c8e5a3f49bee15189fc2232401 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 26 Dec 2018 19:43:38 +0800 Subject: [PATCH 020/123] add init once for assign layer --- python/paddle/fluid/layers/nn.py | 8 ++-- python/paddle/fluid/layers/tensor.py | 39 ++++++++++++++----- .../fluid/tests/unittests/test_layers.py | 12 ++++++ 3 files changed, 46 insertions(+), 13 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index cc1fdbd285..00523c0798 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5010,10 +5010,12 @@ def nce(input, alias_probs_[little[0]] = 1.0 alias_[little[0]] = -1 - probs = assign(input=np.array(custom_dist).astype('float32')) - custom_alias = assign(input=np.array(alias_).astype('int32')) + probs = assign( + input=np.array(custom_dist).astype('float32'), init_once=True) + custom_alias = assign( + input=np.array(alias_).astype('int32'), init_once=True) custom_alias_probs = assign( - input=np.array(alias_probs_).astype('float32')) + input=np.array(alias_probs_).astype('float32'), init_once=True) inputs['CustomDistProbs'] = probs inputs['CustomDistAlias'] = custom_alias diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 49a486cf0c..d66d92b1df 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -285,7 +285,7 @@ def sums(input, out=None): return out -def assign(input, output=None): +def assign(input, output=None, init_once=False): """ **Assign** @@ -294,6 +294,7 @@ def assign(input, output=None): Args: input(Variable|numpy.ndarray): The source variable output(Variable|None): The destination variable + init_once(bool|false): assign value into global var only in startup program. Returns: Variable: The destination variable that was supplied as the *output*. 
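For reference, a minimal usage sketch for the init_once flag introduced here, modeled on the test_assign unit test that this patch adds below; the variable names are illustrative assumptions, not part of the patch.

import numpy as np
import paddle.fluid as fluid

startup = fluid.Program()
main = fluid.Program()
with fluid.program_guard(main, startup):
    # init_once=True appends the assign_value op to the startup program,
    # so the constant is written once at initialization instead of being
    # re-assigned on every execution of the main program.
    probs = fluid.layers.assign(
        input=np.random.random([1, 2]).astype('float32'), init_once=True)
# The assign_value op now appears in `startup`, not in `main`.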
@@ -307,10 +308,18 @@ def assign(input, output=None): """ helper = LayerHelper('assign', **locals()) if output is None: - output = helper.create_variable_for_type_inference(dtype=input.dtype) + if init_once: + output = helper.create_parameter( + attr=ParamAttr(), shape=input.shape, dtype=input.dtype) + else: + output = helper.create_variable_for_type_inference( + dtype=input.dtype) if isinstance(input, Variable): + if init_once: + raise ValueError("init once only support numpy assign!") helper.append_op( type='assign', inputs={'X': [input]}, outputs={'Out': [output]}) + elif isinstance(input, numpy.ndarray): dtype = convert_np_dtype_to_dtype_(input.dtype) if dtype == VarDesc.VarType.FP32: @@ -325,14 +334,24 @@ def assign(input, output=None): raise ValueError("The size of input is too big. Please consider " "saving it to file and 'load_op' to load it") - helper.append_op( - type='assign_value', - outputs={'Out': [output]}, - attrs={ - 'dtype': dtype, - 'shape': list(input.shape), - value_name: values - }) + if init_once: + helper.startup_program.global_block().append_op( + type='assign_value', + outputs={'Out': [output]}, + attrs={ + 'dtype': dtype, + 'shape': list(input.shape), + value_name: values + }) + else: + helper.append_op( + type='assign_value', + outputs={'Out': [output]}, + attrs={ + 'dtype': dtype, + 'shape': list(input.shape), + value_name: values + }) else: raise ValueError("Wrong type for assign input: %s" % type(input)) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index e180822c2b..92065abb9b 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -1015,6 +1015,18 @@ class TestBook(unittest.TestCase): print(str(program)) + def test_assign(self): + import numpy as np + startup = Program() + main = Program() + with program_guard(main, startup): + probs = layers.assign( + input=np.random.random([1, 2]).astype('float32'), + init_once=True) + + print(str(main)) + print(str(startup)) + if __name__ == '__main__': unittest.main() From 031995cf589b936891aecedb5e13cd93f42ec4eb Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 26 Dec 2018 22:10:03 +0800 Subject: [PATCH 021/123] fix --- python/paddle/fluid/layers/nn.py | 2 +- python/paddle/fluid/layers/tensor.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 00523c0798..ee165d092c 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -26,7 +26,7 @@ from ..initializer import Normal, Constant from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ -from .tensor import concat +from .tensor import concat, assign from . import utils from .. 
import unique_name from functools import reduce diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index d66d92b1df..5d5657eae5 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -310,7 +310,10 @@ def assign(input, output=None, init_once=False): if output is None: if init_once: output = helper.create_parameter( - attr=ParamAttr(), shape=input.shape, dtype=input.dtype) + attr=ParamAttr(), + shape=input.shape, + dtype=input.dtype, + default_initializer=Constant(0.0)) else: output = helper.create_variable_for_type_inference( dtype=input.dtype) From 0384f3309a68c40c2c7e88c317dc536e3279e8e0 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 26 Dec 2018 22:36:02 +0800 Subject: [PATCH 022/123] enable unit test for test_nce test=develop --- python/paddle/fluid/layers/tensor.py | 1 + python/paddle/fluid/tests/unittests/CMakeLists.txt | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 5d5657eae5..3e36fb7632 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -314,6 +314,7 @@ def assign(input, output=None, init_once=False): shape=input.shape, dtype=input.dtype, default_initializer=Constant(0.0)) + output.stop_gradient = True else: output = helper.create_variable_for_type_inference( dtype=input.dtype) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6d6fe245d8..5d0fd7b1b1 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -32,7 +32,6 @@ endif() list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 -list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 From 9322d34032d2157486841bd0ddd45ca9a420db92 Mon Sep 17 00:00:00 2001 From: shippingwang Date: Wed, 26 Dec 2018 17:22:17 +0000 Subject: [PATCH 023/123] Fix, test=develop --- paddle/fluid/API.spec | 2 +- paddle/fluid/operators/shuffle_channel_op.cc | 33 +++++++++----------- paddle/fluid/operators/shuffle_channel_op.h | 1 - python/paddle/fluid/layers/nn.py | 4 +-- 4 files changed, 17 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index b1e8a986fb..a480a14ecc 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -208,7 +208,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, 
defaults=(0.0, False, False, None, None, -1)) -paddle.fluid.layers.shuffle_channel ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(1, None)) +paddle.fluid.layers.shuffle_channel ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 8449efe4a9..9b0631d5ff 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -29,15 +29,13 @@ class ShuffleChannelOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", input_dims); } - /* - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); - } - */ + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } }; class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { @@ -89,16 +87,13 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), input_dims); } - /* - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); - } - */ + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } }; } // namespace operators diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h index dbb4495e33..f6af1bc885 100644 --- a/paddle/fluid/operators/shuffle_channel_op.h +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -50,7 +50,6 @@ class ShuffleChannelOpKernel : public framework::OpKernel { } } } - return; } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4d0ed7c55c..9ebbb35c07 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9335,13 +9335,13 @@ def get_tensor_from_selected_rows(x, name=None): return out -def shuffle_channel(x, group=1, name=None): +def shuffle_channel(x, group, name=None): """ **Shuffle Channel Operator** This operator obtains the group convolutional layer with channels shuffled. First, divide the input channels in each group into several subgroups, then, feed each group in the next layer with different subgroups. - Shuffle channel operation makes it possible to build more powerful structures + Channel shuffling operation makes it possible to build more powerful structures with multiple group convolutional layers. 
Args:
From 83f2e2c903d2040f1c664b4b44947d9a7a6a3650 Mon Sep 17 00:00:00 2001
From: shippingwang
Date: Fri, 28 Dec 2018 07:52:09 +0000
Subject: [PATCH 024/123] rewrite the comments, test=develop

---
 paddle/fluid/operators/shuffle_channel_op.cc | 11 ++---
 python/paddle/fluid/layers/nn.py | 52 +++++++++++++++-----
 2 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc
index 9b0631d5ff..9349912e09 100644
--- a/paddle/fluid/operators/shuffle_channel_op.cc
+++ b/paddle/fluid/operators/shuffle_channel_op.cc
@@ -55,17 +55,12 @@ class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker {
   AddComment(R"DOC(
 Shuffle Channel operator
- This operator obtains the group convolutional layer with channels shuffled.
- Firstly, divide the input channels in each group into several subgroups,
- then, feed each group in the next layer with different subgroups.
-
- According to the paper, "Suppose a convolution layer with G groups
- whose output has (G * N) channels, first reshape the output channel dimension into(G,N),
- transposing and then flattening it back as the input of next layer. "
+ This operator shuffles the channels of input x.
+ It divides the input channels in each group into several subgroups,
+ and obtains a new order by selecting elements from every subgroup one by one.

 Shuffle channel operation makes it possible to build more powerful structures
 with multiple group convolutional layers.
-
 please get more information from the following paper:
 https://arxiv.org/pdf/1707.01083.pdf
 )DOC");
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 9ebbb35c07..6f5aeaa527 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -9338,27 +9338,57 @@ def get_tensor_from_selected_rows(x, name=None):
 def shuffle_channel(x, group, name=None):
     """
     **Shuffle Channel Operator**
-    This operator obtains the group convolutional layer with channels shuffled.
-    First, divide the input channels in each group into several subgroups,
-    then, feed each group in the next layer with different subgroups.
-    Channel shuffling operation makes it possible to build more powerful structures
-    with multiple group convolutional layers.
+    This operator shuffles the channels of input x.
+    It divides the input channels in each group into :attr:`group` subgroups,
+    and obtains a new order by selecting elements from every subgroup one by one.
+
+    Please refer to the paper
+    https://arxiv.org/pdf/1707.01083.pdf
+    .. code-block:: text
+
+        Given a 4-D tensor input with the shape (N, C, H, W):
+            input.shape = (1, 4, 2, 2)
+            input.data =[[[[0.1, 0.2],
+                           [0.2, 0.3]],
+
+                          [[0.3, 0.4],
+                           [0.4, 0.5]],
+
+                          [[0.5, 0.6],
+                           [0.6, 0.7]],
+
+                          [[0.7, 0.8],
+                           [0.8, 0.9]]]]
+            Given group: 2
+            then we get a 4-D tensor out with the same shape as the input:
+            out.shape = (1, 4, 2, 2)
+            out.data = [[[[0.1, 0.2],
+                          [0.2, 0.3]],
+
+                         [[0.5, 0.6],
+                          [0.6, 0.7]],
+
+                         [[0.3, 0.4],
+                          [0.4, 0.5]],
+
+                         [[0.7, 0.8],
+                          [0.8, 0.9]]]]
+
     Args:
-        x(Variable): The input tensor variable.
-        group(Integer): The num of group.
+        x(Variable): The input tensor variable. It should be a 4-D tensor with shape [N, C, H, W]
+        group(int): The count of subgroups; it should evenly divide the number of channels.

     Returns:
-        Variable: channels shuffled tensor variable.
+        out(Variable): the channel-shuffled result, a tensor variable with the
+        same shape and same type as the input.
Raises: ValueError: If group is not an int type variable. Examples: .. code-block:: python - - out = fluid.layers.shuffle_channel(x=group_conv,group=4) - + input = fluid.layers.data(name='input', shape=[1,4,2,2], dtype='float32') + out = fluid.layers.shuffle_channel(x=input, group=2) """ helper = LayerHelper("shuffle_channel", **locals()) From 908684a535a162f54f3e01b449779dda1853de85 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 9 Jan 2019 14:27:49 +0800 Subject: [PATCH 025/123] change the largest size of assign --- python/paddle/fluid/layers/tensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 3e36fb7632..4f73194d82 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -334,7 +334,7 @@ def assign(input, output=None, init_once=False): values = [int(v) for v in input.flat] else: raise ValueError("Unsupported dtype %s", input.dtype) - if input.size > 1024 * 1024: + if input.size > 1024 * 1024 * 5: raise ValueError("The size of input is too big. Please consider " "saving it to file and 'load_op' to load it") From 4e3522e5b4484d91ae5e519612a32f53600d2293 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 9 Jan 2019 09:05:33 +0000 Subject: [PATCH 026/123] add trt int8 support test=develop --- paddle/fluid/inference/analysis/argument.h | 3 + paddle/fluid/inference/analysis/helper.h | 15 ++ .../inference/analysis/ir_pass_manager.cc | 11 +- .../ir_passes/tensorrt_subgraph_pass.cc | 29 +++- paddle/fluid/inference/api/analysis_config.cc | 7 +- .../fluid/inference/api/analysis_predictor.cc | 57 +++++++ .../fluid/inference/api/analysis_predictor.h | 3 + .../inference/api/paddle_analysis_config.h | 4 +- .../fluid/inference/tensorrt/CMakeLists.txt | 2 +- paddle/fluid/inference/tensorrt/engine.cc | 7 + paddle/fluid/inference/tensorrt/engine.h | 57 +++---- .../inference/tensorrt/trt_int8_calibrator.cc | 144 ++++++++++++++++++ .../inference/tensorrt/trt_int8_calibrator.h | 128 ++++++++++++++++ .../operators/tensorrt/tensorrt_engine_op.cc | 9 +- .../operators/tensorrt/tensorrt_engine_op.h | 88 ++++++++++- 15 files changed, 514 insertions(+), 50 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc create mode 100644 paddle/fluid/inference/tensorrt/trt_int8_calibrator.h diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 2d8980b1d1..b06ff63a74 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -104,6 +104,7 @@ struct Argument { DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string); DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool); + DECL_ARGUMENT_FIELD(model_path, ModelPath, std::string); // The overall graph to work on. DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); @@ -126,6 +127,8 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); + DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, + std::string); // The program transformed by IR analysis phase. 
DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 269a0da9f9..5df3aacc3f 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -156,6 +156,21 @@ static bool PathExists(const std::string &path) { return false; } +static std::string SplitPath(const std::string path) { + char sep = '/'; + +#ifdef _WIN32 + sep = '\\'; +#endif + + size_t i = path.rfind(sep, path.length()); + if (i != std::string::npos) { + return (path.substr(0, i)); + } + + return path; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index e37fea38bc..a996055774 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -67,9 +67,17 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("min_subgraph_size", new int(argument->tensorrt_min_subgraph_size())); + pass->Set( + "program", + new framework::ProgramDesc *( + const_cast(&argument->main_program()))); + pass->Set("precision_mode", + new std::string(argument->tensorrt_precision_mode())); + pass->Set("model_dir", new std::string(argument->model_path())); } // graph_ = pass->Apply(std::move(graph_)); + pre_pass = pass_name; passes_.emplace_back(std::move(pass)); @@ -94,7 +102,8 @@ framework::proto::ProgramDesc IRPassManager::AcquireProgram( auto pass = framework::ir::PassRegistry::Instance().Get("graph_to_program_pass"); - ProgramDesc desc(program); + ProgramDesc desc; + desc.CopyFrom(*const_cast(program).Proto()); pass->SetNotOwned("program", &desc); auto *the_graph = graph->release(); *graph = pass->Apply(std::unique_ptr(the_graph)); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index bc06e78ae6..634c5ead0a 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -72,13 +72,23 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, auto &subgraph = *Agent(node).subgraph(); PADDLE_ENFORCE(!subgraph.empty()); + framework::ProgramDesc *program_desc = + Get("program"); + // Add new block for TensorRTEngineOP + const framework::BlockDesc &main_block = + program_desc->Block(framework::kRootBlockIndex); + // const framework::BlockDesc& main_block = program_desc->Block(0); + framework::BlockDesc *new_block = program_desc->AppendBlock(main_block); + // An fake block desc. framework::proto::BlockDesc block_proto; framework::BlockDesc block_desc(nullptr, &block_proto); block_desc.Proto()->set_parent_idx(-1); block_desc.Proto()->set_idx(0); for (auto *node : subgraph) { + auto *new_block_op = new_block->AppendOp(); auto *op = block_desc.AppendOp(); + *new_block_op->Proto() = *node->Op()->Proto(); *op->Proto() = *node->Op()->Proto(); } @@ -178,7 +188,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // to Tensor. 
std::vector output_mapping; for (auto name : output_names) { - // LOG(INFO) << name << " " << output_name_map.size(); PADDLE_ENFORCE(output_name_map.count(name) != 0); output_mapping.push_back(output_name_map[name]); } @@ -189,9 +198,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, *vars->Add() = *node->Var()->Proto(); } } + PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), "the block has no var-desc"); PADDLE_ENFORCE(!output_mapping.empty()); + op_desc->SetBlockAttr("sub_block", new_block); // Set attrs SetAttr(op_desc->Proto(), "subgraph", block_desc.Proto()->SerializeAsString()); @@ -199,6 +210,22 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); + + std::string engine_key = std::to_string( + std::hash()(block_desc.Proto()->SerializeAsString())); + std::string precision_mode = Get("precision_mode"); + SetAttr(op_desc->Proto(), "calibration_data", std::string("")); + std::string trt_calib_file = + Get("model_dir") + "/trt_calib_" + engine_key; + if (precision_mode == "INT8" && FileExists(trt_calib_file)) { + std::ifstream infile(trt_calib_file, std::ios::in); + std::stringstream buffer; + buffer << infile.rdbuf(); + std::string calibration_data(buffer.str()); + SetAttr(op_desc->Proto(), "calibration_data", calibration_data); + } + SetAttr(op_desc->Proto(), "precision_mode", precision_mode); + SetAttr(op_desc->Proto(), "engine_key", engine_key); } std::vector ExtractParameters( diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 211c691504..399db291fd 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -86,6 +86,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { CP_MEMBER(tensorrt_workspace_size_); CP_MEMBER(tensorrt_max_batchsize_); CP_MEMBER(tensorrt_min_subgraph_size_); + CP_MEMBER(tensorrt_precision_mode_); // MKLDNN releated. 
CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -123,10 +124,13 @@ void contrib::AnalysisConfig::EnableMKLDNN() { void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, int max_batch_size, - int min_subgraph_size) { + int min_subgraph_size, + std::string precision_mode) { use_tensorrt_ = true; tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; + tensorrt_precision_mode_ = precision_mode; + Update(); } void contrib::AnalysisConfig::Update() { @@ -176,6 +180,7 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() { ss << use_tensorrt_; ss << tensorrt_workspace_size_; ss << tensorrt_max_batchsize_; + ss << tensorrt_precision_mode_; ss << use_mkldnn_; ss << enable_ir_optim_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 585634fae9..75c62bb98c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/inference/api/analysis_predictor.h" #include #include +#include #include #include #include @@ -30,6 +31,8 @@ #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #endif +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" @@ -41,6 +44,10 @@ DECLARE_bool(profile); namespace paddle { using contrib::AnalysisConfig; +using inference::Singleton; +using inference::tensorrt::TRTInt8Calibrator; +using inference::tensorrt::TRTCalibratorRes; +using inference::tensorrt::TRTCalibratorResManager; namespace { bool IsPersistable(const framework::VarDesc *var) { @@ -321,11 +328,15 @@ void AnalysisPredictor::OptimizeInferenceProgram() { // Analyze inference_program if (!config_.model_dir().empty()) { argument_.SetModelDir(config_.model_dir()); + argument_.SetModelPath(config_.model_dir()); } else { PADDLE_ENFORCE( !config_.params_file().empty(), "Either model_dir or (param_file, prog_file) should be set."); PADDLE_ENFORCE(!config_.prog_file().empty()); + std::string dir = inference::analysis::SplitPath(config_.prog_file()); + + argument_.SetModelPath(dir); argument_.SetModelProgramPath(config_.prog_file()); argument_.SetModelParamsPath(config_.params_file()); } @@ -335,6 +346,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); + argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); } if (config_.use_mkldnn_) { @@ -550,7 +562,52 @@ bool AnalysisPredictor::LoadParameters() { return true; } +bool AnalysisPredictor::SaveTrtCalibToDisk() { + PADDLE_ENFORCE(config_.tensorrt_engine_enabled(), + "This func can be invoked only in trt mode"); + auto &block = inference_program_->Block(0); + for (auto &op_desc : block.AllOps()) { + if (op_desc->Type() == "tensorrt_engine") { + std::string engine_name = + boost::get(op_desc->GetAttr("engine_key")); + if (!Singleton::Global().Has(engine_name)) { + LOG(ERROR) << "You should run the predictor(with trt) on the real data " + "to generate calibration info"; + return false; + } + TRTCalibratorRes *calib_res = + Singleton::Global().Get(engine_name); + LOG(INFO) << "Wait for 
calib threads done."; + calib_res->calib_->waitAndSetDone(); + LOG(INFO) << "Finish wait."; + calib_res->thr_->join(); + std::string calibration_data = + calib_res->calib_->getCalibrationTableAsString(); + + if (calibration_data.size() == 0) { + LOG(ERROR) << "the calibration table is empty."; + return false; + } + std::string calibration_data_path = + argument_.model_path() + "/trt_calib_" + engine_name; + std::ofstream ofile(calibration_data_path, std::ios::out); + LOG(INFO) << "Write Paddle-TRT INT8 calibration data to file " + << calibration_data_path; + ofile << calibration_data; + ofile.close(); + } + } + // Free all calibrator resources. + Singleton::Global().DeleteALL(); + return true; +} + AnalysisPredictor::~AnalysisPredictor() { + if (config_.tensorrt_engine_enabled() && + config_.tensorrt_precision_mode_ == "INT8" && + Singleton::Global().Has()) { + SaveTrtCalibToDisk(); + } if (FLAGS_profile) { platform::DisableProfiler(platform::EventSortingKey::kTotal, "./profile.log"); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index a6e126c5d5..cec36a0d3a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -90,6 +90,9 @@ class AnalysisPredictor : public PaddlePredictor { template void GetFetchOne(const framework::LoDTensor &fetchs, PaddleTensor *output_data); + + bool SaveTrtCalibToDisk(); + ~AnalysisPredictor(); // Some more detailed tests, they are made the friends of the predictor, so that diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index ae6ac69854..14b16d08b3 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -135,7 +135,8 @@ struct AnalysisConfig { * subgraph is less than this, it will not transfer to TensorRT engine. */ void EnableTensorRtEngine(int workspace_size = 1 << 20, - int max_batch_size = 1, int min_subgraph_size = 3); + int max_batch_size = 1, int min_subgraph_size = 3, + std::string precision = "FP32"); /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } @@ -231,6 +232,7 @@ struct AnalysisConfig { // We set this variable to control the minimum number of nodes in the // subgraph, 3 as default value. 
int tensorrt_min_subgraph_size_{3}; + std::string tensorrt_precision_mode_; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 9afeafd176..f4977d08c4 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,4 @@ -nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) +nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index f739752cbc..43f99df463 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -70,6 +70,13 @@ void TensorRTEngine::FreezeNetwork() { // build engine. infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxWorkspaceSize(max_workspace_); + if (precision_mode_ == "INT8") { + infer_builder_->setInt8Mode(true); + PADDLE_ENFORCE( + calibrator_ != nullptr, + "The precision mode is 'INT8', the calibrator should not be nullptr"); + infer_builder_->setInt8Calibrator(calibrator_); + } infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_)); PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index f5b2c28ba9..9aed374dce 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -23,12 +23,14 @@ limitations under the License. */ #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace inference { namespace tensorrt { +class TRTInt8Calibrator; /* * TensorRT Engine. * @@ -56,12 +58,16 @@ class TensorRTEngine : public EngineBase { TensorRTEngine(int max_batch, int max_workspace, cudaStream_t* stream = nullptr, int device = 0, + std::string precision_mode = "FP32", + TRTInt8Calibrator* calibrator = nullptr, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), stream_(stream ? stream : &default_stream_), - logger_(logger), - device_(device) { + device_(device), + precision_mode_(precision_mode), + calibrator_(calibrator), + logger_(logger) { freshDeviceId(); cudaStreamCreate(stream_); } @@ -142,8 +148,8 @@ class TensorRTEngine : public EngineBase { // In the normal case, the paddle-trt exists bug when runing the googlenet. // When there are more than two convolutions of 1 * 1 with the same input, the // paddle-tensorrt will do the merging optimization, which fuse those conv - // into - // one conv, and then trigger bug. So, We should use strategy to avoid this + // into one conv, and then trigger bug. So, We should use strategy to avoid + // this // optimization for the time being. This bug will be fixed in the future. 
std::unordered_map itensor_quote_num; @@ -156,11 +162,16 @@ class TensorRTEngine : public EngineBase { // the max memory size the engine uses int max_workspace_; - // batch size of the current data, will be updated each Executation. - int batch_size_{-1}; cudaStream_t* stream_; // If stream_ is not set from outside, hold its own stream. cudaStream_t default_stream_; + // The specific GPU id that the TensorRTEngine bounded to. + int device_; + + std::string precision_mode_; + TRTInt8Calibrator* calibrator_; + // batch size of the current data, will be updated each Executation. + int batch_size_{-1}; nvinfer1::ILogger& logger_; std::vector buffers_; @@ -169,8 +180,6 @@ class TensorRTEngine : public EngineBase { std::unordered_map itensor_map_; - // The specific GPU id that the TensorRTEngine bounded to. - int device_; std::vector> owned_plugin_; // TensorRT related internal members @@ -208,38 +217,6 @@ class TensorRTEngine : public EngineBase { #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \ engine__->network()->add##layer__(ARGS); -/* - * Helper to control the TensorRT engine's creation and deletion. - */ -class TRT_EngineManager { - public: - bool HasEngine(const std::string& name) const { - return engines_.count(name) != 0; - } - - // Get an engine called `name`. - TensorRTEngine* Get(const std::string& name) const { - return engines_.at(name).get(); - } - - // Create or get an engine called `name` - TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream, - const std::string& name, int gpu_device = 0) { - auto* p = new TensorRTEngine(max_batch, max_workspace, stream, gpu_device); - engines_[name].reset(p); - return p; - } - - void DeleteALl() { - for (auto& item : engines_) { - item.second.reset(nullptr); - } - } - - private: - std::unordered_map> engines_; -}; - } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc new file mode 100644 index 0000000000..f935620020 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -0,0 +1,144 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
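The calibrator whose implementation follows couples two threads: the prediction thread pushes device buffers in through setBatch(), while the TensorRT engine-build thread pulls them out through getBatch(); a mutex and condition variable alternate the two until calibration finishes. A simplified Python model of that hand-off (illustrative only; the real code also tracks a calib_running_ flag and copies CUDA buffers):

import threading

class CalibratorModel:
    """Sketch of the setBatch/getBatch hand-off implemented below in C++."""

    def __init__(self):
        self.cond = threading.Condition()
        self.data_is_set = False
        self.done = False
        self.batch = None

    def set_batch(self, batch):  # producer: the prediction thread
        with self.cond:
            while self.data_is_set and not self.done:
                self.cond.wait()
            if self.done:
                return False
            self.batch, self.data_is_set = batch, True
            self.cond.notify_all()
            return True

    def get_batch(self):  # consumer: TensorRT's engine-build thread
        with self.cond:
            while not self.data_is_set and not self.done:
                self.cond.wait()
            if self.done:
                return None
            self.data_is_set = False
            self.cond.notify_all()
            return self.batch

    def wait_and_set_done(self):  # unblocks both sides when calibration ends
        with self.cond:
            self.done = True
            self.cond.notify_all()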
+ +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" +#include "glog/logging.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// set the batch size before constructing the thread to execute engine +int TRTInt8Calibrator::getBatchSize() const { return batch_size_; } + +TRTInt8Calibrator::TRTInt8Calibrator( + const std::unordered_map& buffers, int batch_size, + std::string engine_name, const platform::Place place) + : batch_size_(batch_size), + calib_running_(true), + data_is_set_(false), + done_(false), + engine_name_(engine_name) { + int i = 0; + VLOG(4) << "Init a new calibrator: " << engine_name_; + for (const auto it : buffers) { + framework::Tensor temp_tensor; + std::string input_name = it.first; + int data_size = it.second; + int num_ele = data_size / sizeof(int16_t); + framework::DDim data_shape = framework::make_ddim({num_ele}); + temp_tensor.Resize(data_shape); + data_tensors_.push_back(temp_tensor); + data_buffers_[input_name] = std::pair( + static_cast(temp_tensor.mutable_data(place)), num_ele); + i += 1; + } +} + +TRTInt8Calibrator::TRTInt8Calibrator(const std::string& calib_data) + : batch_size_(0), + calib_running_(false), + data_is_set_(false), + done_(true), + calibration_table_(calib_data) {} + +void TRTInt8Calibrator::waitAndSetDone() { + std::unique_lock lk(mut_); + while ((calib_running_ || data_is_set_) && !done_) cond_.wait(lk); + if (!done_) { + done_ = true; + cond_.notify_all(); + } +} + +bool TRTInt8Calibrator::setBatch( + const std::unordered_map& data) { + VLOG(3) << "set batch: " << engine_name_; + std::unique_lock lk(mut_); + while ((calib_running_ || data_is_set_) && (!done_)) cond_.wait(lk); + if (done_) return false; + + // Sets the batch. + for (const auto it : data) { + auto dataptr = data_buffers_.find(it.first); + if (dataptr == data_buffers_.end()) { + LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first + << "' does not match with the buffer names"; + } + + const auto& d = dataptr->second; + auto status = + cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice); + if (status != cudaSuccess) { + LOG(FATAL) << "cudaMemcpy " << engine_name_ << " for '" << it.first + << "' failed with " << status; + } + } + + data_is_set_ = true; + cond_.notify_all(); + return true; +} + +bool TRTInt8Calibrator::getBatch(void** bindings, const char** names, + int num_bindings) { + VLOG(4) << "get batch: " << engine_name_; + std::unique_lock lk(mut_); + calib_running_ = false; + cond_.notify_all(); + + while (!data_is_set_ && !done_) cond_.wait(lk); + if (done_) return false; + + // Gets the batch + for (int i = 0; i < num_bindings; i++) { + auto it = data_buffers_.find(names[i]); + if (it == data_buffers_.end()) { + LOG(FATAL) << "Calibration engine asked for unknown tensor name '" + << names[i] << "' at position " << i; + } + bindings[i] = it->second.first; + } + + data_is_set_ = false; + calib_running_ = true; + VLOG(4) << "get batch done: " << engine_name_; + return true; +} + +void TRTInt8Calibrator::setDone() { + std::unique_lock lk(mut_); + done_ = true; + cond_.notify_all(); +} + +const void* TRTInt8Calibrator::readCalibrationCache(std::size_t& length) { + if (calibration_table_.empty()) return nullptr; + length = calibration_table_.size(); + return calibration_table_.data(); +} + +void TRTInt8Calibrator::writeCalibrationCache(const void* ptr, + std::size_t length) { + calibration_table_ = std::string((const char*)ptr, length); + VLOG(4) << "Got calibration data for " << engine_name_ << " " << ptr 
+ << " length=" << length; +} +TRTInt8Calibrator::~TRTInt8Calibrator() { + VLOG(4) << "Destroying calibrator for " << engine_name_; +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h new file mode 100644 index 0000000000..81ba9c7032 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h @@ -0,0 +1,128 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class TensorRTEngine; + +struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { + public: + TRTInt8Calibrator(const std::unordered_map& buffers, + int batch_size, std::string engine_name, + const platform::Place place); + + explicit TRTInt8Calibrator(const std::string& calibration_data); + ~TRTInt8Calibrator(); + + int getBatchSize() const override; + + bool getBatch(void* bindings[], const char* names[], + int num_bindings) override; + + bool setBatch(const std::unordered_map& data); + void setDone(); + void waitAndSetDone(); + + const void* readCalibrationCache(std::size_t& length) override; + void writeCalibrationCache(const void* ptr, std::size_t length) override; + const std::string& getCalibrationTableAsString() { + return calibration_table_; + } + + private: + const int batch_size_; + + bool calib_running_; + bool data_is_set_; + bool done_; + + std::mutex mut_; + std::condition_variable cond_; + + std::unordered_map> data_buffers_; + std::vector data_tensors_; + + std::string engine_name_; + std::string calibration_table_; +}; + +class TRTCalibratorRes { + public: + TRTCalibratorRes() {} + std::unique_ptr calib_; + std::unique_ptr thr_; + std::unique_ptr engine_; +}; +/* + * Manager to control the TensorRT Int8 calibration creation and deltetion. + */ +class TRTCalibratorResManager { + public: + bool Has() const { return res_.size() > 0; } + bool Has(const std::string& name) const { + if (res_.count(name) == 0) return false; + return res_.at(name).get() != nullptr; + } + + // Get Int8Calibrator via name + TRTCalibratorRes* Get(const std::string& name) const { + return res_.at(name).get(); + } + + // Look up or create a calibrator. 
+ TRTCalibratorRes* LookupOrCreate(const std::string& engine_name) { + if (res_.count(engine_name) == 0) { + auto* p = new TRTCalibratorRes(); + res_[engine_name].reset(p); + } + return res_.at(engine_name).get(); + } + + // Create an Int8Calibrator + TRTCalibratorRes* Create(const std::string& engine_name) { + auto* p = new TRTCalibratorRes(); + res_[engine_name].reset(p); + return p; + } + + void DeleteALL() { + for (auto& item : res_) { + item.second.reset(nullptr); + } + } + + private: + std::unordered_map> res_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index b993c55fad..ed177eb18f 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -29,8 +29,15 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Xs", "A list of inputs.").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); + AddAttr("calibration_data", "the calibration data for int8"); + AddAttr( + "engine_key", + "The engine_key here is used to distinguish different TRT Engines"); AddAttr("max_batch_size", "the maximum batch size."); AddAttr("workspace_size", "the workspace size."); + AddAttr("sub_block", "the trt block"); + AddAttr("precision_mode", + "the precision mode: 'FP32', 'INT8' "); AddComment("TensorRT engine operator."); } }; @@ -47,6 +54,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference { namespace ops = paddle::operators; REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, - ops::TensorRTEngineOpMaker); + ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker); #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 88c4f50847..57747faec8 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -17,8 +17,10 @@ #ifdef PADDLE_WITH_CUDA #include +#include #include +#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" @@ -62,6 +64,9 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { using inference::Singleton; using inference::tensorrt::TensorRTEngine; +using inference::tensorrt::TRTInt8Calibrator; +using inference::tensorrt::TRTCalibratorRes; +using inference::tensorrt::TRTCalibratorResManager; class TensorRTEngineOp : public framework::OperatorBase { private: @@ -70,6 +75,11 @@ class TensorRTEngineOp : public framework::OperatorBase { mutable std::unique_ptr trt_engine_; int max_batch_size_; int workspace_size_; + std::unique_ptr calibrator_; + std::string precision_mode_; + std::string calibration_data_; + std::string engine_key_; + bool calibration_mode_; public: TensorRTEngineOp(const std::string &type, @@ -80,26 +90,95 @@ class TensorRTEngineOp : public framework::OperatorBase { input_names_ = Inputs("Xs"); max_batch_size_ = Attr("max_batch_size"); workspace_size_ = Attr("workspace_size"); + precision_mode_ = Attr("precision_mode"); + calibration_data_ = Attr("calibration_data"); + engine_key_ = Attr("engine_key"); auto params = Attr>("parameters"); for (const auto ¶m : params) { param_names_.insert(param); } + calibration_mode_ = + (precision_mode_ == "INT8" 
&& calibration_data_.size() == 0); + + if (precision_mode_ == "INT8" && calibration_data_.size()) { + calibrator_.reset(new TRTInt8Calibrator(calibration_data_)); + } } protected: + void RunNative(const framework::Scope &scope, + const platform::Place &dev_place) const { + framework::Executor executor(dev_place); + auto *block = Attr("sub_block"); + auto *program = block->Program(); + auto *scope_ptr = const_cast(&scope); + auto ctx = executor.Prepare(*program, block->ID()); + executor.RunPreparedContext(ctx.get(), scope_ptr, false, true, true); + } + void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { + if (calibration_mode_ == true) { + RunCalibration(scope, dev_place); + return; + } RunTrt(scope, dev_place); } + void RunCalibration(const framework::Scope &scope, + const platform::Place &dev_place) const { + // Create calibrator here. + LOG(INFO) << "Running calibration trt int8 ..."; + int runtime_batch = 1; + if (!Singleton::Global().Has(engine_key_)) { + TRTCalibratorRes *calib_res = + Singleton::Global().Create(engine_key_); + std::unordered_map calib_buffers; + for (auto &x : input_names_) { + if (param_names_.count(x)) continue; + auto &t = + inference::analysis::GetFromScope(scope, x); + calib_buffers[x] = t.memory_size(); + auto t_shape = framework::vectorize(t.dims()); + runtime_batch = t_shape[0]; + } + calib_res->calib_.reset(new TRTInt8Calibrator( + calib_buffers, runtime_batch, engine_key_, dev_place)); + calib_res->thr_.reset(new std::thread([&]() { + calib_res->engine_.reset(new TensorRTEngine( + max_batch_size_, workspace_size_, nullptr, + boost::get(dev_place).device, precision_mode_, + calib_res->calib_.get())); + VLOG(3) << "start the calib trt engine thread"; + Prepare(scope, dev_place, calib_res->engine_.get()); + })); + } + + TRTInt8Calibrator *temp_calibrator = + Singleton::Global() + .Get(engine_key_) + ->calib_.get(); + std::unordered_map calib_data; + + for (auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + auto &t = + inference::analysis::GetFromScope(scope, x); + calib_data.emplace(x, t.data()); + } + temp_calibrator->setBatch(calib_data); + RunNative(scope, dev_place); + } + void RunTrt(const framework::Scope &scope, const platform::Place &dev_place) const { int runtime_batch = 1; if (trt_engine_.get() == nullptr) { - trt_engine_.reset(new TensorRTEngine( - max_batch_size_, workspace_size_, nullptr, - boost::get(dev_place).device)); + trt_engine_.reset( + new TensorRTEngine(max_batch_size_, workspace_size_, nullptr, + boost::get(dev_place).device, + precision_mode_, calibrator_.get())); Prepare(scope, dev_place, trt_engine_.get()); } @@ -168,7 +247,8 @@ class TensorRTEngineOp : public framework::OperatorBase { void Prepare(const framework::Scope &scope, const platform::Place &dev_place, TensorRTEngine *engine) const { - VLOG(4) << "Prepare engine"; + LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " + "kernel etc). 
This process may cost a lot of time."; framework::proto::BlockDesc block_desc; block_desc.ParseFromString(Attr("subgraph")); From c1264e99f3a213b7cb8ff659aa998e17f47ef937 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 9 Jan 2019 10:20:49 +0000 Subject: [PATCH 027/123] fix win error test=develop --- paddle/fluid/inference/tensorrt/trt_int8_calibrator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h index 81ba9c7032..13f6e7ad01 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once -#include #include #include #include @@ -23,6 +22,7 @@ #include #include "NvInfer.h" +#include "cuda_runtime_api.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/platform/place.h" From 422449a945d9c9a725f562c184f9be3a8a930db7 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 9 Jan 2019 18:40:50 +0800 Subject: [PATCH 028/123] fix style --- paddle/fluid/operators/reader/ctr_reader.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 6bcbaae38e..14c4809df2 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -153,9 +153,9 @@ class CTRReader : public framework::FileReader { VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; for (int thread_id = 0; thread_id < thread_num_; thread_id++) { - read_threads_.emplace_back(new std::thread( - std::bind(&ReadThread, file_groups_[thread_id], data_desc_, static_cast(thread_id), - &read_thread_status_, queue_))); + read_threads_.emplace_back(new std::thread(std::bind( + &ReadThread, file_groups_[thread_id], data_desc_, + static_cast(thread_id), &read_thread_status_, queue_))); } monitor_thread_.reset(new std::thread( std::bind(&MonitorThread, &read_thread_status_, queue_))); From 0a79d7a40429c1a8415925f795e09e859fcd2a3d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 9 Jan 2019 18:55:55 +0800 Subject: [PATCH 029/123] fix merge --- paddle/fluid/operators/reader/ctr_reader.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 14c4809df2..740cd5219c 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -102,15 +102,15 @@ void MonitorThread(std::vector* thread_status, class CTRReader : public framework::FileReader { public: - explicit CTRReader(const std::shared_ptr& queue, - int batch_size, size_t thread_num, - const std::vector& slots, - const std::vector& file_list) - : batch_size_(batch_size), slots_(slots), file_list_(file_list) { + CTRReader(const std::shared_ptr& queue, + int thread_num, const DataDesc& data_desc) + : data_desc_(data_desc) { PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!"); PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); - PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); - thread_num_ = std::min(file_list_.size(), thread_num); + PADDLE_ENFORCE_GT(data_desc_.file_names_.size(), 0, + "file list should not be empty"); + + thread_num_ = std::min(data_desc_.file_names_.size(), 
thread_num);
     queue_ = queue;
     SplitFiles();
     for (size_t i = 0; i < thread_num_; ++i) {

From 653cd319711030f8953f3dff1cfb3afee336f6d1 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Wed, 9 Jan 2019 19:02:53 +0800
Subject: [PATCH 030/123] remove unused code

---
 paddle/fluid/operators/reader/ctr_reader.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc
index e8edbf6602..f08798794a 100644
--- a/paddle/fluid/operators/reader/ctr_reader.cc
+++ b/paddle/fluid/operators/reader/ctr_reader.cc
@@ -310,8 +310,6 @@ void ReadCsvData(const DataDesc& data_desc, std::shared_ptr reader,
                              platform::CPUPlace());
     memcpy(label_tensor_data, batch_label.data(),
            batch_label.size() * sizeof(int64_t));
-    auto dim =
-        framework::make_ddim({static_cast(batch_label.size()), 1});
     lod_datas.push_back(label_tensor);

     // insert tensor for each dense_slots

From 53e76a2eb60b8a6fb8a29f22ab6df6a9ca963e12 Mon Sep 17 00:00:00 2001
From: shippingwang
Date: Sat, 29 Dec 2018 11:38:36 +0000
Subject: [PATCH 031/123] Fix comments display, test=develop

---
 python/paddle/fluid/layers/nn.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 6f5aeaa527..2354819869 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -9338,6 +9338,7 @@ def get_tensor_from_selected_rows(x, name=None):
 def shuffle_channel(x, group, name=None):
     """
     **Shuffle Channel Operator**
+
     This operator shuffles the channels of input x.
     It divides the input channels in each group into :attr:`group` subgroups,
     and obtains a new order by selecting elements from every subgroup one by one.
@@ -9346,6 +9347,7 @@ def shuffle_channel(x, group, name=None):
     https://arxiv.org/pdf/1707.01083.pdf
     .. code-block:: text
+
     Given a 4-D tensor input with the shape (N, C, H, W):
         input.shape = (1, 4, 2, 2)
         input.data =[[[[0.1, 0.2],
                        [0.2, 0.3]],
@@ -9387,7 +9389,8 @@ def shuffle_channel(x, group, name=None):

     Examples:
         ..
code-block:: python - input = fluid.layers.data(name='input', shape=[1,4,2,2], dtype='float32') + + input = fluid.layers.data(name='input', shape=[4,2,2], dtype='float32') out = fluid.layers.shuffle_channel(x=input, group=2) """ helper = LayerHelper("shuffle_channel", **locals()) From 312fe0ece16dc316904318b61f3dacaa8777eade Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 16 Jan 2019 08:27:59 +0000 Subject: [PATCH 032/123] add trt int8 calibration support fix comments test=develop --- paddle/fluid/inference/analysis/argument.h | 3 +- paddle/fluid/inference/analysis/helper.cc | 8 +++ paddle/fluid/inference/analysis/helper.h | 8 ++- .../inference/analysis/ir_pass_manager.cc | 10 ++-- .../ir_passes/tensorrt_subgraph_pass.cc | 37 +++++++++----- paddle/fluid/inference/api/analysis_config.cc | 12 ++--- .../fluid/inference/api/analysis_predictor.cc | 49 +++++++++++-------- .../fluid/inference/api/analysis_predictor.h | 13 +++++ .../inference/api/paddle_analysis_config.h | 8 ++- paddle/fluid/inference/tensorrt/engine.cc | 2 +- paddle/fluid/inference/tensorrt/engine.h | 6 +-- .../inference/tensorrt/trt_int8_calibrator.cc | 31 ++++++------ .../inference/tensorrt/trt_int8_calibrator.h | 28 +++++------ .../operators/tensorrt/tensorrt_engine_op.cc | 3 +- .../operators/tensorrt/tensorrt_engine_op.h | 38 +++++++------- 15 files changed, 158 insertions(+), 98 deletions(-) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index b06ff63a74..c317172fa2 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -28,6 +28,7 @@ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/platform/variant.h" namespace paddle { @@ -128,7 +129,7 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, - std::string); + contrib::AnalysisConfig::Precision); // The program transformed by IR analysis phase. 
DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc index ca40c01fc5..4f5c50d0d6 100644 --- a/paddle/fluid/inference/analysis/helper.cc +++ b/paddle/fluid/inference/analysis/helper.cc @@ -36,6 +36,14 @@ void SetAttr(framework::proto::OpDesc *op, const std::string &name, attr->set_i(data); } template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const bool &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(data); +} +template <> void SetAttr(framework::proto::OpDesc *op, const std::string &name, const int64_t &data) { auto *attr = op->add_attrs(); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 5df3aacc3f..40c94d9904 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -156,7 +156,7 @@ static bool PathExists(const std::string &path) { return false; } -static std::string SplitPath(const std::string path) { +static std::string GetDirRoot(const std::string path) { char sep = '/'; #ifdef _WIN32 @@ -167,10 +167,14 @@ static std::string SplitPath(const std::string path) { if (i != std::string::npos) { return (path.substr(0, i)); } - return path; } +static std::string GetTrtCalibPath(const std::string &model_root, + const std::string &engine_key) { + return model_root + "/trt_calib_" + engine_key; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index a996055774..f9ef0a68e9 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -71,13 +71,17 @@ void IRPassManager::CreatePasses(Argument *argument, "program", new framework::ProgramDesc *( const_cast(&argument->main_program()))); - pass->Set("precision_mode", - new std::string(argument->tensorrt_precision_mode())); + + bool enable_int8 = false; + if (argument->tensorrt_precision_mode() == + contrib::AnalysisConfig::Precision::kInt8) + enable_int8 = true; + + pass->Set("enable_int8", new bool(enable_int8)); pass->Set("model_dir", new std::string(argument->model_path())); } // graph_ = pass->Apply(std::move(graph_)); - pre_pass = pass_name; passes_.emplace_back(std::move(pass)); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 634c5ead0a..34991b6fbc 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include +#include #include #include @@ -93,8 +94,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, } // collect inputs - std::unordered_set input_names; - std::unordered_set input_names_with_id; + std::set input_names; + std::set input_names_with_id; for (auto *x : node->inputs) { input_names.insert(x->Name()); input_names_with_id.insert(x->Name() + std::to_string(x->id())); @@ -102,8 +103,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, op_desc->SetInput( "Xs", std::vector(input_names.begin(), input_names.end())); - std::unordered_set output_names; - std::unordered_set output_names_with_id; + std::set output_names; + std::set output_names_with_id; for (auto *x : node->outputs) { output_names.insert(x->Name()); output_names_with_id.insert(x->Name() + std::to_string(x->id())); @@ -203,28 +204,40 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, "the block has no var-desc"); PADDLE_ENFORCE(!output_mapping.empty()); op_desc->SetBlockAttr("sub_block", new_block); - // Set attrs SetAttr(op_desc->Proto(), "subgraph", block_desc.Proto()->SerializeAsString()); + // Set attrs SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size")); SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); - std::string engine_key = std::to_string( - std::hash()(block_desc.Proto()->SerializeAsString())); - std::string precision_mode = Get("precision_mode"); + auto enable_int8 = Get("enable_int8"); SetAttr(op_desc->Proto(), "calibration_data", std::string("")); - std::string trt_calib_file = - Get("model_dir") + "/trt_calib_" + engine_key; - if (precision_mode == "INT8" && FileExists(trt_calib_file)) { + + // we use the subgraph's inputs and outputs to generate the engine key. 
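The std::unordered_set to std::set switch above is not cosmetic: the hunk that follows concatenates these names into the engine key, so iteration order must be deterministic or the same subgraph would map to a different key, and a different calibration file, on every run. A Python sketch of the keying scheme, with hashlib standing in for std::hash:

    import hashlib

    def make_engine_key(input_names_with_id, output_names_with_id):
        # sorted() plays the role of std::set: the key must not depend
        # on hash-table iteration order.
        blob = "".join(sorted(input_names_with_id)) + \
               "".join(sorted(output_names_with_id))
        return hashlib.md5(blob.encode()).hexdigest()

    # Same subgraph, same key, regardless of insertion order.
    assert make_engine_key({"x_1", "w_2"}, {"z0_7"}) == \
           make_engine_key({"w_2", "x_1"}, {"z0_7"})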
+      std::string engine_hash_key = "";
+      for (auto name : input_names_with_id) {
+        engine_hash_key += name;
+      }
+      for (auto name : output_names_with_id) {
+        engine_hash_key += name;
+      }
+
+      auto engine_key =
+          std::to_string(std::hash<std::string>()(engine_hash_key));
+
+      auto trt_calib_file =
+          GetTrtCalibPath(Get<std::string>("model_dir"), engine_key);
+      VLOG(3) << "engine key: " << engine_key;
+      if (enable_int8 && FileExists(trt_calib_file)) {
+        VLOG(3) << "Calibration table file: " << trt_calib_file
+                << " is found here";
         std::ifstream infile(trt_calib_file, std::ios::in);
         std::stringstream buffer;
         buffer << infile.rdbuf();
         std::string calibration_data(buffer.str());
         SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
       }
-  SetAttr(op_desc->Proto(), "precision_mode", precision_mode);
+  SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
 }

diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 399db291fd..7c7efe7a3d 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -122,13 +122,13 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
 #endif
 }

-void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
-                                                   int max_batch_size,
-                                                   int min_subgraph_size,
-                                                   std::string precision_mode) {
+void contrib::AnalysisConfig::EnableTensorRtEngine(
+    int workspace_size, int max_batch_size, int min_subgraph_size,
+    contrib::AnalysisConfig::Precision precision_mode) {
   use_tensorrt_ = true;
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
+  tensorrt_min_subgraph_size_ = min_subgraph_size;
   tensorrt_precision_mode_ = precision_mode;
   Update();
 }
@@ -149,7 +149,7 @@ void contrib::AnalysisConfig::Update() {
         << "TensorRT engine is not available when EnableGpu() not activated.";
   } else {
     // Append after the infer_clean pass.
- pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); + pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } } @@ -180,7 +180,7 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() { ss << use_tensorrt_; ss << tensorrt_workspace_size_; ss << tensorrt_max_batchsize_; - ss << tensorrt_precision_mode_; + ss << tensorrt_min_subgraph_size_; ss << use_mkldnn_; ss << enable_ir_optim_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 75c62bb98c..838016bd76 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -30,9 +30,9 @@ #include "paddle/fluid/inference/api/paddle_inference_pass.h" #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #endif #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" @@ -46,8 +46,8 @@ namespace paddle { using contrib::AnalysisConfig; using inference::Singleton; using inference::tensorrt::TRTInt8Calibrator; -using inference::tensorrt::TRTCalibratorRes; -using inference::tensorrt::TRTCalibratorResManager; +using inference::tensorrt::TRTCalibratorEngine; +using inference::tensorrt::TRTCalibratorEngineManager; namespace { bool IsPersistable(const framework::VarDesc *var) { @@ -334,7 +334,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { !config_.params_file().empty(), "Either model_dir or (param_file, prog_file) should be set."); PADDLE_ENFORCE(!config_.prog_file().empty()); - std::string dir = inference::analysis::SplitPath(config_.prog_file()); + std::string dir = inference::analysis::GetDirRoot(config_.prog_file()); argument_.SetModelPath(dir); argument_.SetModelProgramPath(config_.prog_file()); @@ -562,6 +562,7 @@ bool AnalysisPredictor::LoadParameters() { return true; } +#if PADDLE_WITH_TENSORRT bool AnalysisPredictor::SaveTrtCalibToDisk() { PADDLE_ENFORCE(config_.tensorrt_engine_enabled(), "This func can be invoked only in trt mode"); @@ -570,44 +571,50 @@ bool AnalysisPredictor::SaveTrtCalibToDisk() { if (op_desc->Type() == "tensorrt_engine") { std::string engine_name = boost::get(op_desc->GetAttr("engine_key")); - if (!Singleton::Global().Has(engine_name)) { + if (!Singleton::Global().Has(engine_name)) { LOG(ERROR) << "You should run the predictor(with trt) on the real data " "to generate calibration info"; return false; } - TRTCalibratorRes *calib_res = - Singleton::Global().Get(engine_name); + TRTCalibratorEngine *calib_engine = + Singleton::Global().Get(engine_name); LOG(INFO) << "Wait for calib threads done."; - calib_res->calib_->waitAndSetDone(); + calib_engine->calib_->waitAndSetDone(); LOG(INFO) << "Finish wait."; - calib_res->thr_->join(); - std::string calibration_data = - calib_res->calib_->getCalibrationTableAsString(); + calib_engine->thr_->join(); + std::string calibration_table_data = + calib_engine->calib_->getCalibrationTableAsString(); - if (calibration_data.size() == 0) { + if (calibration_table_data.empty()) { LOG(ERROR) << "the calibration table is empty."; return false; } - std::string calibration_data_path = - argument_.model_path() + "/trt_calib_" + engine_name; - std::ofstream ofile(calibration_data_path, std::ios::out); - LOG(INFO) << "Write Paddle-TRT INT8 calibration data to 
file " - << calibration_data_path; - ofile << calibration_data; + + std::string calibration_table_data_path = + inference::analysis::GetTrtCalibPath(argument_.model_path(), + engine_name); + + std::ofstream ofile(calibration_table_data_path, std::ios::out); + LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file " + << calibration_table_data_path; + ofile << calibration_table_data; ofile.close(); } } // Free all calibrator resources. - Singleton::Global().DeleteALL(); + Singleton::Global().DeleteALL(); return true; } +#endif AnalysisPredictor::~AnalysisPredictor() { +#if PADDLE_WITH_TENSORRT if (config_.tensorrt_engine_enabled() && - config_.tensorrt_precision_mode_ == "INT8" && - Singleton::Global().Has()) { + config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 && + Singleton::Global().Has()) { SaveTrtCalibToDisk(); } +#endif if (FLAGS_profile) { platform::DisableProfiler(platform::EventSortingKey::kTotal, "./profile.log"); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index cec36a0d3a..c87987b167 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -91,7 +91,20 @@ class AnalysisPredictor : public PaddlePredictor { void GetFetchOne(const framework::LoDTensor &fetchs, PaddleTensor *output_data); +#if PADDLE_WITH_TENSORRT + // When we use Paddle-TRT INT8 engine, we need to generate calibration table + // data first, + // the calibration table contains the range for each op's input and output, + // this whole process can be divided into several steps: + // + // 1. Builds a 32-bit engine, runs it on the calibration set, and records a + // histogram for each + // tensor of the distribution of activation values. + // 2. Builds a calibration table from the histograms. + // + // After step 2, we need to store the calibration table on disk bool SaveTrtCalibToDisk(); +#endif ~AnalysisPredictor(); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 14b16d08b3..118af6f401 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -42,6 +42,10 @@ struct AnalysisConfig { explicit AnalysisConfig(const std::string& model_dir); explicit AnalysisConfig(const std::string& prog_file, const std::string& params_file); + enum class Precision { + kFloat32 = 0, + kInt8, + }; /** Set model with a directory. */ @@ -136,7 +140,7 @@ struct AnalysisConfig { */ void EnableTensorRtEngine(int workspace_size = 1 << 20, int max_batch_size = 1, int min_subgraph_size = 3, - std::string precision = "FP32"); + Precision precision = Precision::kFloat32); /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } @@ -232,7 +236,7 @@ struct AnalysisConfig { // We set this variable to control the minimum number of nodes in the // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; - std::string tensorrt_precision_mode_; + Precision tensorrt_precision_mode_; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 43f99df463..808e93d2ed 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -70,7 +70,7 @@ void TensorRTEngine::FreezeNetwork() { // build engine. 
infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxWorkspaceSize(max_workspace_); - if (precision_mode_ == "INT8") { + if (enable_int8_) { infer_builder_->setInt8Mode(true); PADDLE_ENFORCE( calibrator_ != nullptr, diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 9aed374dce..788a4493c0 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -58,14 +58,14 @@ class TensorRTEngine : public EngineBase { TensorRTEngine(int max_batch, int max_workspace, cudaStream_t* stream = nullptr, int device = 0, - std::string precision_mode = "FP32", + bool enable_int8 = "false", TRTInt8Calibrator* calibrator = nullptr, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), stream_(stream ? stream : &default_stream_), device_(device), - precision_mode_(precision_mode), + enable_int8_(enable_int8), calibrator_(calibrator), logger_(logger) { freshDeviceId(); @@ -168,7 +168,7 @@ class TensorRTEngine : public EngineBase { // The specific GPU id that the TensorRTEngine bounded to. int device_; - std::string precision_mode_; + bool enable_int8_; TRTInt8Calibrator* calibrator_; // batch size of the current data, will be updated each Executation. int batch_size_{-1}; diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc index f935620020..4a85c8b8fe 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -25,11 +25,7 @@ int TRTInt8Calibrator::getBatchSize() const { return batch_size_; } TRTInt8Calibrator::TRTInt8Calibrator( const std::unordered_map& buffers, int batch_size, std::string engine_name, const platform::Place place) - : batch_size_(batch_size), - calib_running_(true), - data_is_set_(false), - done_(false), - engine_name_(engine_name) { + : batch_size_(batch_size), engine_name_(engine_name) { int i = 0; VLOG(4) << "Init a new calibrator: " << engine_name_; for (const auto it : buffers) { @@ -62,28 +58,32 @@ void TRTInt8Calibrator::waitAndSetDone() { } } +// There might be more than one input for trt subgraph, +// So, we use a map to store input information. bool TRTInt8Calibrator::setBatch( const std::unordered_map& data) { VLOG(3) << "set batch: " << engine_name_; std::unique_lock lk(mut_); + // There is a producer and a consumer. The producer set the batch data and + // the consumer get the batch data. The size of the data pool is one. + // So, the producer has to wait for the consumer to finish processing before + // they can set the data. while ((calib_running_ || data_is_set_) && (!done_)) cond_.wait(lk); + // The done_ is set to true using waitAndSetDone, When all calibration data + // are processed. if (done_) return false; // Sets the batch. 
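The producer/consumer comments above describe a one-slot handoff between the feeding thread (setBatch) and TensorRT's calibration thread (getBatch), used by the batch-copy loop that follows. The synchronization pattern, reduced to a Python sketch with threading.Condition (field names mirror the C++ members; this is a reduction, not the Paddle code):

    import threading

    class OneSlotMailbox(object):
        def __init__(self):
            self.cond = threading.Condition()
            self.calib_running = True   # consumer busy with previous batch
            self.data_is_set = False    # a batch is waiting in the slot
            self.done = False
            self.slot = None

        def set_batch(self, data):      # producer, cf. setBatch()
            with self.cond:
                while (self.calib_running or self.data_is_set) and not self.done:
                    self.cond.wait()
                if self.done:
                    return False
                self.slot, self.data_is_set = data, True
                self.cond.notify_all()
                return True

        def get_batch(self):            # consumer, cf. getBatch()
            with self.cond:
                self.calib_running = False
                self.cond.notify_all()  # let the producer refill the slot
                while not self.data_is_set and not self.done:
                    self.cond.wait()
                if self.done:
                    return None
                self.data_is_set, self.calib_running = False, True
                return self.slot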
-  for (const auto it : data) {
+  for (const auto& it : data) {
     auto dataptr = data_buffers_.find(it.first);
     if (dataptr == data_buffers_.end()) {
       LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first
                  << "' does not match with the buffer names";
     }
     const auto& d = dataptr->second;
-    auto status =
-        cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice);
-    if (status != cudaSuccess) {
-      LOG(FATAL) << "cudaMemcpy " << engine_name_ << " for '" << it.first
-                 << "' failed with " << status;
-    }
+    PADDLE_ENFORCE(
+        cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice),
+        "Fail to cudaMemcpy %s for %s", engine_name_, it.first);
   }

   data_is_set_ = true;
@@ -95,9 +95,12 @@ bool TRTInt8Calibrator::getBatch(void** bindings, const char** names,
                                  int num_bindings) {
   VLOG(4) << "get batch: " << engine_name_;
   std::unique_lock<std::mutex> lk(mut_);
+  // The consumer has just finished processing a data.
+  // The producer can set the data again.
   calib_running_ = false;
   cond_.notify_all();

+  // As long as there is data in the pool, the consumer can get it.
   while (!data_is_set_ && !done_) cond_.wait(lk);
   if (done_) return false;

@@ -123,7 +126,7 @@ void TRTInt8Calibrator::setDone() {
   cond_.notify_all();
 }

-const void* TRTInt8Calibrator::readCalibrationCache(std::size_t& length) {
+const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) {
   if (calibration_table_.empty()) return nullptr;
   length = calibration_table_.size();
   return calibration_table_.data();
diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
index 13f6e7ad01..919f5d55f8 100644
--- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
+++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
@@ -21,8 +21,8 @@
 #include
 #include

-#include "NvInfer.h"
-#include "cuda_runtime_api.h"
+#include <NvInfer.h>
+#include <cuda_runtime_api.h>
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/platform/place.h"
@@ -60,9 +60,9 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {

  private:
   const int batch_size_;
-  bool calib_running_;
-  bool data_is_set_;
-  bool done_;
+  bool calib_running_{true};
+  bool data_is_set_{false};
+  bool done_{false};

   std::mutex mut_;
   std::condition_variable cond_;
@@ -74,9 +74,9 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
   std::string calibration_table_;
 };

-class TRTCalibratorRes {
+class TRTCalibratorEngine {
  public:
-  TRTCalibratorRes() {}
+  TRTCalibratorEngine() {}
   std::unique_ptr<TRTInt8Calibrator> calib_;
   std::unique_ptr<std::thread> thr_;
   std::unique_ptr<TensorRTEngine> engine_;
@@ -84,7 +84,7 @@ class TRTCalibratorRes {
 /*
  * Manager to control the TensorRT Int8 calibration creation and deletion.
  */
-class TRTCalibratorResManager {
+class TRTCalibratorEngineManager {
  public:
  bool Has() const { return res_.size() > 0; }
  bool Has(const std::string& name) const {
    if (res_.count(name) == 0) return false;
    return res_.at(name).get() != nullptr;
  }

  // Get Int8Calibrator via name
-  TRTCalibratorRes* Get(const std::string& name) const {
+  TRTCalibratorEngine* Get(const std::string& name) const {
    return res_.at(name).get();
  }

  // Look up or create a calibrator.
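LookupOrCreate, which follows the comment above, is a plain get-or-insert over the name-keyed registry, while Create always installs a fresh entry. In Python terms (a simplification; factory stands in for constructing a TRTCalibratorEngine):

    class CalibratorEngineManager(object):
        def __init__(self):
            self.res = {}               # engine_key -> calibrator engine

        def lookup_or_create(self, engine_name, factory):
            # Get-or-insert: reuse an existing engine for this key.
            if engine_name not in self.res:
                self.res[engine_name] = factory()
            return self.res[engine_name]

        def create(self, engine_name, factory):
            # Unconditional insert: always a fresh engine for this key.
            self.res[engine_name] = factory()
            return self.res[engine_name]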
-  TRTCalibratorRes* LookupOrCreate(const std::string& engine_name) {
+  TRTCalibratorEngine* LookupOrCreate(const std::string& engine_name) {
     if (res_.count(engine_name) == 0) {
-      auto* p = new TRTCalibratorRes();
+      auto* p = new TRTCalibratorEngine;
       res_[engine_name].reset(p);
     }
     return res_.at(engine_name).get();
   }

   // Create an Int8Calibrator
-  TRTCalibratorRes* Create(const std::string& engine_name) {
-    auto* p = new TRTCalibratorRes();
+  TRTCalibratorEngine* Create(const std::string& engine_name) {
+    auto* p = new TRTCalibratorEngine;
     res_[engine_name].reset(p);
     return p;
   }
@@ -120,7 +120,7 @@ class TRTCalibratorResManager {
   }

  private:
-  std::unordered_map<std::string, std::unique_ptr<TRTCalibratorRes>> res_;
+  std::unordered_map<std::string, std::unique_ptr<TRTCalibratorEngine>> res_;
 };

 }  // namespace tensorrt
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
index ed177eb18f..031335009b 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
@@ -36,8 +36,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("max_batch_size", "the maximum batch size.");
     AddAttr<int>("workspace_size", "the workspace size.");
     AddAttr<framework::BlockDesc *>("sub_block", "the trt block");
-    AddAttr<std::string>("precision_mode",
-                         "the precision mode: 'FP32', 'INT8' ");
+    AddAttr<bool>("enable_int8", "whether to switch to int8 mode");
     AddComment("TensorRT engine operator.");
   }
 };
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 57747faec8..d27e013dc4 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -65,8 +65,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
 using inference::Singleton;
 using inference::tensorrt::TensorRTEngine;
 using inference::tensorrt::TRTInt8Calibrator;
-using inference::tensorrt::TRTCalibratorRes;
-using inference::tensorrt::TRTCalibratorResManager;
+using inference::tensorrt::TRTCalibratorEngine;
+using inference::tensorrt::TRTCalibratorEngineManager;

 class TensorRTEngineOp : public framework::OperatorBase {
  private:
@@ -76,7 +76,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
   int max_batch_size_;
   int workspace_size_;
   std::unique_ptr<TRTInt8Calibrator> calibrator_;
-  std::string precision_mode_;
+  bool enable_int8_;
   std::string calibration_data_;
   std::string engine_key_;
   bool calibration_mode_;
@@ -90,7 +90,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
     input_names_ = Inputs("Xs");
     max_batch_size_ = Attr<int>("max_batch_size");
     workspace_size_ = Attr<int>("workspace_size");
-    precision_mode_ = Attr<std::string>("precision_mode");
+    enable_int8_ = Attr<bool>("enable_int8");
     calibration_data_ = Attr<std::string>("calibration_data");
     engine_key_ = Attr<std::string>("engine_key");

@@ -98,17 +98,19 @@ class TensorRTEngineOp : public framework::OperatorBase {
     for (const auto &param : params) {
       param_names_.insert(param);
     }
-    calibration_mode_ =
-        (precision_mode_ == "INT8" && calibration_data_.size() == 0);
+    // calibration_mode being true means we need to
+    // generate the calibration table data.
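That rule is computed on the line that follows: calibrate only when INT8 is requested and no table exists yet. Spelled out as a small Python truth table:

    def calibration_mode(enable_int8, calibration_data):
        # True only when this run must *produce* the table.
        return enable_int8 and len(calibration_data) == 0

    assert calibration_mode(True, "")            # first INT8 run: calibrate
    assert not calibration_mode(True, "table")   # table present: real INT8 engine
    assert not calibration_mode(False, "")       # FP32 run: never calibrate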
+ calibration_mode_ = (enable_int8_ && calibration_data_.size() == 0); - if (precision_mode_ == "INT8" && calibration_data_.size()) { + VLOG(4) << "calibration_mode: " << calibration_mode_; + if (enable_int8_ && calibration_data_.size()) { calibrator_.reset(new TRTInt8Calibrator(calibration_data_)); } } protected: - void RunNative(const framework::Scope &scope, - const platform::Place &dev_place) const { + void RunNativeImpl(const framework::Scope &scope, + const platform::Place &dev_place) const { framework::Executor executor(dev_place); auto *block = Attr("sub_block"); auto *program = block->Program(); @@ -128,12 +130,14 @@ class TensorRTEngineOp : public framework::OperatorBase { void RunCalibration(const framework::Scope &scope, const platform::Place &dev_place) const { - // Create calibrator here. + // This process will builds a 32-bit trt engine, runs it on the calibration + // set, and records a histogram for each + // tensor of the distribution of activation values. LOG(INFO) << "Running calibration trt int8 ..."; int runtime_batch = 1; - if (!Singleton::Global().Has(engine_key_)) { - TRTCalibratorRes *calib_res = - Singleton::Global().Create(engine_key_); + if (!Singleton::Global().Has(engine_key_)) { + TRTCalibratorEngine *calib_res = + Singleton::Global().Create(engine_key_); std::unordered_map calib_buffers; for (auto &x : input_names_) { if (param_names_.count(x)) continue; @@ -148,7 +152,7 @@ class TensorRTEngineOp : public framework::OperatorBase { calib_res->thr_.reset(new std::thread([&]() { calib_res->engine_.reset(new TensorRTEngine( max_batch_size_, workspace_size_, nullptr, - boost::get(dev_place).device, precision_mode_, + boost::get(dev_place).device, enable_int8_, calib_res->calib_.get())); VLOG(3) << "start the calib trt engine thread"; Prepare(scope, dev_place, calib_res->engine_.get()); @@ -156,7 +160,7 @@ class TensorRTEngineOp : public framework::OperatorBase { } TRTInt8Calibrator *temp_calibrator = - Singleton::Global() + Singleton::Global() .Get(engine_key_) ->calib_.get(); std::unordered_map calib_data; @@ -168,7 +172,7 @@ class TensorRTEngineOp : public framework::OperatorBase { calib_data.emplace(x, t.data()); } temp_calibrator->setBatch(calib_data); - RunNative(scope, dev_place); + RunNativeImpl(scope, dev_place); } void RunTrt(const framework::Scope &scope, @@ -178,7 +182,7 @@ class TensorRTEngineOp : public framework::OperatorBase { trt_engine_.reset( new TensorRTEngine(max_batch_size_, workspace_size_, nullptr, boost::get(dev_place).device, - precision_mode_, calibrator_.get())); + enable_int8_, calibrator_.get())); Prepare(scope, dev_place, trt_engine_.get()); } From b95f2ff8fe06dcfe0143c05c055c1fa199079f8d Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 16 Jan 2019 09:06:26 +0000 Subject: [PATCH 033/123] fix win build bug test=develop --- paddle/fluid/inference/api/analysis_predictor.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 838016bd76..0988682c6a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -45,9 +45,11 @@ namespace paddle { using contrib::AnalysisConfig; using inference::Singleton; +#if PADDLE_WITH_TENSORRT using inference::tensorrt::TRTInt8Calibrator; using inference::tensorrt::TRTCalibratorEngine; using inference::tensorrt::TRTCalibratorEngineManager; +#endif namespace { bool IsPersistable(const framework::VarDesc *var) { From 
8817841c73f7929e9d25e6f310900f6c0c6290b6 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Thu, 17 Jan 2019 05:42:53 +0000 Subject: [PATCH 034/123] fix unit test bug test=develop --- paddle/fluid/inference/tensorrt/engine.h | 2 +- .../tensorrt/tensorrt_engine_op_test.cc | 50 +++++++++---------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 788a4493c0..4e8eabce47 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -58,7 +58,7 @@ class TensorRTEngine : public EngineBase { TensorRTEngine(int max_batch, int max_workspace, cudaStream_t* stream = nullptr, int device = 0, - bool enable_int8 = "false", + bool enable_int8 = false, TRTInt8Calibrator* calibrator = nullptr, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 287b0edc96..212c50820c 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -96,19 +96,20 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetInput("Xs", std::vector({"x"})); engine_op_desc.SetOutput("Ys", std::vector({"z0"})); - SetAttr(engine_op_desc.Proto(), "subgraph", - block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch_size", 2); - SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); - SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); - SetAttr>(engine_op_desc.Proto(), "parameters", - std::vector({})); - SetAttr>(engine_op_desc.Proto(), - "output_name_mapping", - std::vector({"z0"})); + + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(2)); + engine_op_desc.SetAttr("workspace_size", static_cast(2 << 10)); + engine_op_desc.SetAttr("parameters", std::vector({})); + engine_op_desc.SetAttr("engine_key", std::string("a_engine")); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z0"})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); LOG(INFO) << "create engine op"; - auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); LOG(INFO) << "engine_op " << engine_op.get(); framework::Scope scope; @@ -190,20 +191,19 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetInput("Xs", std::vector({"x0"})); engine_op_desc.SetOutput("Ys", std::vector({"z3"})); - SetAttr(engine_op_desc.Proto(), "subgraph", - block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch_size", batch_size); - SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); - SetAttr>( - engine_op_desc.Proto(), "parameters", - std::vector({"y0", "y1", "y2", "y3"})); - SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "b_engine"); - - SetAttr>(engine_op_desc.Proto(), - "output_name_mapping", - std::vector({"z3"})); - - auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(batch_size)); + engine_op_desc.SetAttr("workspace_size", 
static_cast(2 << 10)); + engine_op_desc.SetAttr("parameters", + std::vector({"y0", "y1", "y2", "y3"})); + engine_op_desc.SetAttr("engine_key", std::string("b_engine")); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z3"})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); // Execute them. engine_op->Run(scope, place); From 579d7582549f4c886dcf26ad3ba0b4de3bb6b7f9 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 17 Jan 2019 12:41:52 +0000 Subject: [PATCH 035/123] fix jitkernel tests and refine benchmark test=develop --- paddle/fluid/operators/jit/benchmark.cc | 130 ++++++++++++++++++------ paddle/fluid/operators/jit/test.cc | 30 ++++-- 2 files changed, 117 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index b39ce28093..e7041b1228 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -26,6 +26,49 @@ DEFINE_int32(burning, 10, "Burning times."); DEFINE_int32(repeat, 3000, "Repeat times."); DEFINE_int32(max_size, 1000, "The Max size would be tested."); +DEFINE_string(filter, "", "The Benchmark name would be run."); + +class BenchJITKernel { + public: + BenchJITKernel() = default; + virtual ~BenchJITKernel() = default; + virtual void Run() = 0; + virtual const char* Name() = 0; + virtual const char* Dtype() = 0; + virtual const char* Place() = 0; +}; + +static std::vector g_all_benchmarks; + +BenchJITKernel* InsertBenchmark(BenchJITKernel* b) { + g_all_benchmarks.push_back(b); + return b; +} + +#define BENCH_JITKERNEL(name, dtype, place) \ + class BenchJITKernel_##name##_##dtype##_##place##_ : public BenchJITKernel { \ + public: \ + const char* Name() override { return #name; } \ + const char* Dtype() override { return #dtype; } \ + const char* Place() override { return #place; } \ + void Run() override; \ + }; \ + static auto inserted_##name##_##dtype##_##place##_ = \ + InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_()); \ + void BenchJITKernel_##name##_##dtype##_##place##_::Run() + +#define BENCH_FP32_CPU(name) BENCH_JITKERNEL(name, FP32, CPU) + +void RUN_ALL_BENCHMARK() { + for (auto p : g_all_benchmarks) { + if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) { + continue; + } + LOG(INFO) << "Benchmark " << p->Name() << "." << p->Dtype() << "." 
+ << p->Place(); + p->Run(); + } +} template void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), @@ -228,49 +271,70 @@ void BenchMatMulKernel() { } } +using T = float; +using PlaceType = paddle::platform::CPUPlace; + +// xyzn +BENCH_FP32_CPU(kVMul) { BenchXYZNKernel(); } + +BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel(); } + +BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel(); } + +BENCH_FP32_CPU(kVSub) { BenchXYZNKernel(); } + +// axyn +BENCH_FP32_CPU(kVScal) { BenchAXYNKernel(); } + +BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel(); } + +// xyn +BENCH_FP32_CPU(kVRelu) { BenchXYNKernel(); } + +BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel(); } + +BENCH_FP32_CPU(kVSquare) { BenchXYNKernel(); } + +BENCH_FP32_CPU(kVExp) { BenchXYNKernel(); } + +BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel(); } + +BENCH_FP32_CPU(kVTanh) { BenchXYNKernel(); } + +// lstm and peephole +BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel(); } + +BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel(); } + +// gru functions +BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel(); } + +BENCH_FP32_CPU(kGRUHtPart1) { + BenchGRUKernel(); +} + +BENCH_FP32_CPU(kGRUHtPart2) { + BenchGRUKernel(); +} + +// seq pool function +BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel(); } + +// matmul +BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel(); } + // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] // Options: // --burning: the burning time before count // --repeat: the repeat times // --max_size: the max size would be tested +// --filter: the bench name would be run int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, true); google::InitGoogleLogging(argv[0]); LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat << " times."; - using T = float; - using PlaceType = paddle::platform::CPUPlace; - // xyzn - BenchXYZNKernel(); - BenchXYZNKernel(); - BenchXYZNKernel(); - BenchXYZNKernel(); - - // axyn - BenchAXYNKernel(); - BenchAXYNKernel(); - - // xyn - BenchXYNKernel(); - BenchXYNKernel(); - BenchXYNKernel(); - BenchXYNKernel(); - BenchXYNKernel(); - BenchXYNKernel(); - - // lstm and peephole - BenchLSTMKernel(); - BenchLSTMKernel(); - - // gru functions - BenchGRUKernel(); - BenchGRUKernel(); - BenchGRUKernel(); - - // seq pool function - BenchSeqPoolKernel(); - // matmul - BenchMatMulKernel(); + RUN_ALL_BENCHMARK(); } diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index f4415a54ca..68a79b6314 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -22,6 +22,8 @@ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/place.h" +static double acc = 1e-5; + template void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), const T upper = static_cast(20.f)) { @@ -37,7 +39,7 @@ template void ExpectEQ(const T* target, const T* refer, int n) { if (std::is_floating_point::value) { for (int i = 0; i < n; ++i) { - EXPECT_NEAR(target[i], refer[i], 1e-5); + EXPECT_NEAR(target[i], refer[i], acc); } } else { for (int i = 0; i < n; ++i) { @@ -62,7 +64,9 @@ namespace jit = paddle::operators::jit; template struct TestFuncWithRefer { - void operator()(const typename KernelTuples::func_type tgt, Args... args) {} + void operator()(const typename KernelTuples::func_type tgt, Args... 
args) { + LOG(FATAL) << "Should specify this function."; + } }; template @@ -140,7 +144,8 @@ struct TestFuncWithRefer, std::vector, std::vector> { template struct TestFuncWithRefer, std::vector, std::vector, - std::vector, std::vector, std::vector> { + std::vector, std::vector, std::vector, + typename jit::LSTMTuples::attr_type> { void operator()(const typename jit::LSTMTuples::func_type tgt, const std::vector& xsrc, const std::vector& wp, const std::vector& ct_1, const std::vector& ct_ref, @@ -185,7 +190,8 @@ struct TestFuncWithRefer, std::vector, std::vector, template struct TestFuncWithRefer, std::vector, std::vector, - std::vector> { + std::vector, + typename jit::GRUTuples::attr_type> { void operator()(const typename jit::GRUTuples::func_type tgt, const std::vector& xsrc, const std::vector& ht_1, const std::vector& ht_ref, @@ -212,8 +218,8 @@ struct TestFuncWithRefer, std::vector, std::vector, }; template -struct TestFuncWithRefer, std::vector, - std::vector> { +struct TestFuncWithRefer, std::vector, std::vector, + typename jit::SeqPoolTuples::attr_type> { void operator()(const typename jit::SeqPoolTuples::func_type tgt, const std::vector& x, const std::vector& yref, const typename jit::SeqPoolTuples::attr_type& attr) { @@ -385,8 +391,8 @@ void TestLSTMKernel() { std::vector xsrc(4 * d), wp(3 * d), ct_1(d); std::vector ct_ref(d), ht_ref(d), checked(2 * d); RandomVec(4 * d, xsrc.data(), -2.f, 2.f); - RandomVec(3 * d, wp.data(), -2.f, 2.f); - RandomVec(d, ct_1.data(), -2.f, 2.f); + RandomVec(3 * d, wp.data(), -1.f, 1.f); + RandomVec(d, ct_1.data(), -1.f, 1.f); // x could be changed after compute, so copy to save src std::vector x(xsrc.size()); std::copy(xsrc.begin(), xsrc.end(), x.begin()); @@ -481,14 +487,17 @@ void TestSeqPoolKernel() { template void TestMatMulKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + auto last_acc = acc; + // TODO(intel): this should be acc issue of MKL + acc = 1e-3; for (int m : {1, 2, 3, 4}) { for (int n : {1, 2, 3, 4}) { for (int k : TestSizes()) { auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); std::vector a(m * k), b(k * n), c(m * n); - RandomVec(m * k, a.data(), -0.2f, 0.2f); - RandomVec(k * n, b.data(), -0.2f, 0.2f); + RandomVec(m * k, a.data(), -2.f, 2.f); + RandomVec(k * n, b.data(), -2.f, 2.f); const T* a_data = a.data(); const T* b_data = b.data(); T* c_data = c.data(); @@ -498,6 +507,7 @@ void TestMatMulKernel() { } } } + acc = last_acc; } template From 316e44b1b7d674b9b6ac0a8bbfa1725a75d2fbda Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 18 Jan 2019 13:55:02 +0000 Subject: [PATCH 036/123] fix unused warnings test=develop --- paddle/fluid/operators/jit/benchmark.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index e7041b1228..74d6a87247 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/port.h" +#include "paddle/fluid/platform/variant.h" // for UNUSED DEFINE_int32(burning, 10, "Burning times."); DEFINE_int32(repeat, 3000, "Repeat times."); @@ -53,7 +54,7 @@ BenchJITKernel* InsertBenchmark(BenchJITKernel* b) { const char* Place() override { return #place; } \ void Run() override; \ }; \ - static auto inserted_##name##_##dtype##_##place##_ = \ + static auto inserted_##name##_##dtype##_##place##_ UNUSED = \ 
InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_()); \ void BenchJITKernel_##name##_##dtype##_##place##_::Run() From 451896fce4d48306737883c73a837ecda3f691d7 Mon Sep 17 00:00:00 2001 From: WangZhen Date: Sat, 19 Jan 2019 02:58:25 +0800 Subject: [PATCH 037/123] init quantization. --- .../fluid/framework/details/build_strategy.cc | 1 + paddle/fluid/framework/ir/pass.cc | 4 + paddle/fluid/pybind/ir.cc | 6 + paddle/fluid/pybind/protobuf.cc | 6 + paddle/fluid/pybind/pybind.cc | 14 +- .../paddle/fluid/contrib/slim/graph/graph.py | 79 ++++- .../quantization/quantization_performer.py | 287 ++++++++++++++++++ 7 files changed, 391 insertions(+), 6 deletions(-) create mode 100644 python/paddle/fluid/contrib/slim/quantization/quantization_performer.py diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index df0ff772c9..ad73085f52 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -233,3 +233,4 @@ USE_PASS(sequential_execution_pass); USE_PASS(all_reduce_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); USE_PASS(lock_free_optimize_pass); +USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 6cf405efe6..33ccee6aa0 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -28,10 +28,14 @@ std::unique_ptr Pass::Apply(std::unique_ptr graph) const { PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.", attr); } + auto* native_graph = graph.get(); auto applied_graph = ApplyImpl(std::move(graph)); // TODO(panyx0718): Add more verifications. PADDLE_ENFORCE(!HasCircle(*applied_graph), "Illegal Pass. 
Generated graph shouldn't has cycle."); + PADDLE_ENFORCE(applied_graph.get() == native_graph, + "Pass::Apply() cannot delete the passed graph and shouldn't " + "return a new graph.(For the need of pybind11)"); applied_ = true; return applied_graph; } diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index d32fe58f86..1205ccf7f0 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -42,6 +42,7 @@ void BindGraph(py::module *m) { .def("get_float", &Graph::Get) .def("get_double", &Graph::Get) .def("get_string", &Graph::Get) + .def("get_program", &Graph::Get) .def("set", [](Graph &self, const std::string &attr_name, int attr) { return self.Set(attr_name, new int(attr)); }) .def("set", @@ -57,6 +58,11 @@ void BindGraph(py::module *m) { [](Graph &self, const std::string &attr_name, double attr) { return self.Set(attr_name, new double(attr)); }) + .def("set", + [](Graph &self, const std::string &attr_name, + const ProgramDesc &attr) { + return self.Set(attr_name, new ProgramDesc(attr)); + }) .def("erase", &Graph::Erase) .def("nodes", &Graph::Nodes, return_value_policy::reference) .def("create_var_node", diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 4b218fb3a2..09c08f1ffc 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -229,6 +229,12 @@ void BindBlockDesc(pybind11::module *m) { void BindVarDsec(pybind11::module *m) { pybind11::class_ var_desc(*m, "VarDesc", ""); var_desc + .def("__init__", + [](pd::VarDesc &self, const pybind11::bytes &binary_str) { + std::string str(binary_str); + new (&self) pd::VarDesc(str); + }, + pybind11::return_value_policy::reference) .def("name", &pd::VarDesc::Name, pybind11::return_value_policy::reference) .def("set_name", &pd::VarDesc::SetName) .def("set_shape", &pd::VarDesc::SetShape) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f3f4854a9e..ae50f3885f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -786,9 +786,20 @@ All parameter, weight, gradient are variables in Paddle. m.def("disable_profiler", platform::DisableProfiler); m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("reset_profiler", platform::ResetProfiler); + m.def("get_pass", [](const py::bytes &binary_str) { + std::string pass_type(binary_str); + auto pass = framework::ir::PassRegistry::Instance().Get(pass_type); + return std::shared_ptr(std::move(pass)); + }); py::class_> pass(m, "Pass"); pass.def(py::init()) + .def("has", &ir::Pass::Has) + .def("set_program", + [](ir::Pass &self, const std::string &attr_name, + const ProgramDesc &attr) { + return self.Set(attr_name, new ProgramDesc(attr)); + }) .def( "set_str", [](ir::Pass &self, const std::string &name, const std::string &attr) { @@ -796,11 +807,12 @@ All parameter, weight, gradient are variables in Paddle. 
}) .def("set_int", [](ir::Pass &self, const std::string &name, int val) { self.Set(name, new int(val)); }) + .def("get_program", &ir::Pass::Get) .def("type", &ir::Pass::Type) .def("apply", [](ir::Pass &self, std::shared_ptr graph) { std::unique_ptr origin_graph(graph.get()); auto optim_graph = self.Apply(std::move(origin_graph)); - graph.reset(optim_graph.release()); + optim_graph.release(); }); py::class_> pb( diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py index 7d6b070203..774da2d1ef 100644 --- a/python/paddle/fluid/contrib/slim/graph/graph.py +++ b/python/paddle/fluid/contrib/slim/graph/graph.py @@ -13,8 +13,81 @@ # limitations under the License. from ....framework import Program +from ....framework import Block +from .... import core -__all__ = ['Graph', 'ImitationGraph', 'IRGraph'] +__all__ = ['Graph', 'ImitationGraph', 'PyGraph'] + + +class PyGraph(object): + """ + PyGraph uses core.Graph as the delegation to accomplish the manipulation. + """ + + def __init__(self, graph): + assert isinstance( + graph, core.Graph), 'graph must be the instance of core.Graph.' + self.graph = graph + + def all_parameters(self): + params = [] + for node in self.graph.nodes(): + if node.is_var() and node.var().persistable(): + params.append(node) + return params + + def all_vars(self): + return [node for node in self.graph.nodes() if node.is_var()] + + def all_ops(self): + return [node for node in self.graph.nodes() if node.is_op()] + + def create_param_node(self, name, var_type, shape, var_dtype): + var_desc = core.VarDesc(name) + var_desc.set_type(var_type) + var_desc.set_shape(shape) + var_desc.set_dtype(var_dtype) + var_desc.set_persistable(True) + return self.graph.create_var_node(var_desc) + + def create_var_node(self, name, var_type, shape, var_dtype): + var_desc = core.VarDesc(name) + var_desc.set_type(var_type) + var_desc.set_shape(shape) + var_desc.set_dtype(var_dtype) + return self.graph.create_var_node(var_desc) + + def create_var_node_from_desc(self, var_desc): + return self.graph.create_var_node(var_desc) + + def create_op_node(self, op_type, attrs, inputs, outputs): + op_desc = core.OpDesc() + op_desc.set_type(op_type) + for attr, value in attrs.iteritems(): + self._update_desc_attr(op_desc, attr, value) + for input_name, var_node in inputs.iteritems(): + op_desc.set_input(input_name, [var_node.name()]) + for output_name, var_node in outputs.iteritems(): + op_desc.set_output(output_name, [var_node.name()]) + return self.graph.create_op_node(op_desc) + + def create_op_node_from_desc(self, op_desc): + return self.graph.create_op_node(op_desc) + + def _update_desc_attr(self, desc, name, val): + """ + Update the value of desc's attribute by attribute's name. 
+ """ + if isinstance(val, Block): + desc.set_block_attr(name, val.desc) + elif isinstance(val, list) and val and all( + isinstance(v, Block) for v in val): + desc.set_blocks_attr(name, [v.desc for v in val]) + elif isinstance(val, core.BlockDesc) or \ + isinstance(val, core.ProgramDesc): + desc.set_serialized_attr(name, val.serialize_to_string()) + else: + desc._set_attr(name, val) class Graph(object): @@ -39,7 +112,3 @@ class ImitationGraph(Graph): def all_parameters(self): return self.program.global_block().all_parameters() - - -class IRGraph(Graph): - pass diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py b/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py new file mode 100644 index 0000000000..7d9207dfbc --- /dev/null +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py @@ -0,0 +1,287 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import numpy as np +from .... import core +from ....initializer import Constant +from .... import unique_name +from ..graph import PyGraph + + +class QuantizationPerformer(object): + def __init__(self, + weight_bits=8, + activation_bits=8, + activation_quantize_type='abs_max', + weight_quantize_type='abs_max', + window_size=10000): + """ + Convert and rewrite the IRGraph according to weight and + activation quantization type. + Args: + weight_bits (int): quantization bit number for weights, + the bias is not quantized. + activation_bits (int): quantization bit number for activation. + activation_quantize_type (str): quantization type for activation, + now support 'abs_max', 'range_abs_max'. If use 'abs_max' mode, + the quantization scale will be calculated dynamically each step + in both training and testing period. If use 'range_abs_max', + a static quantization scale will be calculated during training + and used in inference. + weight_quantize_type (str): quantization type for weights, + support 'abs_max'. The 'range_abs_max' usually is not used for + weight, since weights are fixed once the model is well trained. + window_size (int): the window size for 'range_abs_max' quantization. + Examples: + .. code-block:: python + # the original graph will be rewrite, if you don't want to + # change it, please clone at first. + # graph = graph.clone() + from paddle.fluid.contrib.slim import * + from paddle.fluid.contrib.quantize import * + graph = IRGraph(program) + performer = QuantizationPerformer() + performer.quantize_transform(graph) + """ + self.weight_bits = weight_bits + self.activation_bits = activation_bits + + quant_type = ['abs_max', 'range_abs_max'] + if activation_quantize_type not in quant_type: + raise ValueError( + "Unknown activation_quantize_type : '%s'. It can only be ", + "'abs_max' or 'range_abs_max'.", str(activation_quantize_type)) + if weight_quantize_type not in quant_type: + raise ValueError( + "Unknown weight_quantize_type: '%s'. 
It can only be ", + "'abs_max' or 'range_abs_max'.", str(weight_quantize_type)) + + self.activation_quantize_type = activation_quantize_type + self.weight_quantize_type = weight_quantize_type + self.window_size = window_size + + self.need_inited_outer = collections.OrderedDict() + self.quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul'] + self.quantizable_grad_ops = [ + '%s_grad' % (op) for op in self.quantizable_ops + ] + self.fake_quant_op_types = [ + 'fake_quantize_abs_max', 'fake_quantize_range_abs_max' + ] + self.fake_dequant_op_types = ['fake_dequantize_max_abs'] + self.is_test = None + self.global_step = None + + def quantize_transform(self, graph, is_test): + self.need_inited_outer.clear() + self.is_test = is_test + assert isinstance(graph, + PyGraph), 'graph must be the instance of PyGraph.' + # marked the variable which has been dequantized. + dequantized_vars = collections.OrderedDict() + params = [p.name() for p in graph.all_parameters()] + + def _transform_forward(graph, op): + for var_node in op.inputs: + if var_node.name() in dequantized_vars: + dequant_var_node = dequantized_vars[var_node.name()] + else: + quant_bits = self.weight_bits if var_node.name() in params \ + else self.activation_bits + quant_type = self.weight_quantize_type if var_node.name() \ + in params else self.activation_quantize_type + quant_var_node, scale_var_node = self._insert_quant_op( + graph, var_node, quant_bits, quant_type) + dequant_var_node = self._insert_dequant_op( + graph, quant_var_node, scale_var_node, quant_bits) + dequantized_vars[var_node.name()] = dequant_var_node + self._update_input(var_node, dequant_var_node, op) + + if not self.is_test: + self._create_global_step(graph) + ops = graph.all_ops() + for op in ops: + # transform the forward graph + if op.name() in self.quantizable_ops: + _transform_forward(graph, op) + # rename the inputs of backward op + if op.name() in self.quantizable_grad_ops: + _transform_backward(graph, op) + return self.need_inited_outer + + def _insert_quant_op(self, graph, var_node, quant_bits, quant_type): + """ + Insert fake_quantize_op in the graph. + """ + if quant_type == 'abs_max': + return self._insert_quant_abs_max_op(graph, var_node, quant_bits) + elif quant_type == 'range_abs_max': + return self._inser_quant_range_abs_max_op(graph, var_node, + quant_bits) + + def _insert_quant_abs_max_op(self, graph, var_node, quant_bits): + """ + Insert fake_quantize_abs_max op in the graph. + """ + assert var_node.is_var(), '{} is not a var'.format(var_node.name()) + + quant_var_node = graph.create_var_node( + name=self._quantized_var_name(var_node.name()), + var_type=var_node.var().type(), + shape=var_node.var().shape(), + var_dtype=var_node.var().dtype()) + scale_var_node = graph.create_var_node( + name=self._quantized_scale_name(var_node.name()), + var_type=var_node.var().type(), + shape=var_node.var().shape(), + var_dtype=var_node.var().dtype()) + quant_op_node = graph.create_op_node( + op_type='fake_quantize_abs_max', + attrs={'bit_length': quant_bits}, + inputs={'X': var_node}, + outputs={'Out': quant_var_node, + 'OutScale': scale_var_node}) + self._link_to(var_node, quant_op_node) + self._link_to(quant_op_node, quant_var_node) + self._link_to(quant_op_node, scale_var_node) + return quant_var_node, scale_var_node + + def _insert_quant_range_abs_max_op(self, graph, var_node, quant_bits): + """ + Insert fake_quantize_range_abs_max on the graph. 
+ """ + assert var_node.is_var(), '{} is not a var'.format(var_node.name()) + + quant_var_node = graph.create_var_node( + name=self._quantized_var_name(var_node.name()), + var_type=var_node.var().type(), + shape=var_node.var().shape(), + var_dtype=var_node.var().dtype()) + + scale_in_node = graph.create_param_node( + name=self._quantized_scale_name(var_node.name()), + var_type=core.VarDesc.VarType.LOD_TENSOR, + shape=[1], + var_dtype=var_node.var().dtype()) + self.need_inited_outer[scale_in_node.var()] = Constant(value=0.001) + + scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) + inputs = {'X': var_node, 'InScale': scale_in_node} + outputs = {'Out': quant_var_node, 'OutScale': scale_out_node} + + if not self.is_test: + # The name of scales_var_node maybe 'scales_0', 'scales_1', etc. + scales_node = graph.create_param_node( + name=unique_name.generate('scales'), + var_type=core.VarDesc.VarType.LOD_TENSOR, + shape=[self.window_size], + var_dtype=var_node.var().dtype()) + self.need_inited_outer[scales_node.var()] = Constant(value=0) + inputs['Iter'] = self.global_step + outputs['OutScales'] = scales_node + attrs = { + 'window_size': self.window_size, + 'bit_length': quant_bits, + 'is_test': self.is_test + } + quant_op_node = graph.create_op_node( + op_type='fake_quantize_range_abs_max', + attrs=attrs, + inputs=inputs, + outputs=outputs) + + self._link_to(var_node, quant_op_node) + self._link_to(scale_in_node, quant_op_node) + self._link_to(quant_op_node, quant_var_node) + self._link_to(quant_op_node, scale_out_node) + + if not self.is_test: + self._link_to(self.global_step, quant_op_node) + self._link_to(quant_op_node, scales_node) + + return quant_var_node, scale_out_node + + def _insert_dequant_op(self, graph, var_node, scale_var_node, quant_bits): + """ + Insert fake_dequantize_op in the graph. + """ + assert var_node.is_var(), '{} is not a var'.format(var_node.name()) + + dequant_var_node = graph.create_var_node( + name=self._dequantized_var_name(var_node.name()), + var_type=var_node.var().type(), + shape=var_node.var().shape(), + var_dtype=var_node.var().dtype()) + max_range = (1 << (quant_bits - 1)) - 1 + dequant_op_node = graph.create_op_node( + op_type='fake_dequantize_max_abs', + attrs={'max_range': float(max_range)}, + inputs={'X': var_node, + 'Scale': scale_var_node}, + outputs={'Out': dequant_var_node}) + self._link_to(var_node, dequant_op_node) + self._link_to(scale_var_node, dequant_op_node) + self._link_to(dequant_op_node, dequant_var_node) + return dequant_var_node + + def _update_input(self, old_input_node, new_input_node, op_node): + old_input_node.outputs.remove(op_node) + op_node.inputs.remove(old_input_node) + new_input_node.outputs.append(op_node) + op_node.inputs.append(new_input_node) + + def _link_to(node_in, node_out): + node_in.outputs.append(node_out) + node_out.inputs.append(node_in) + + def _quantized_var_name(self, var_name): + """ + Return quantized variable name for the input `var_name`. + """ + return "%s.quantized" % (var_name) + + def _dequantized_var_name(self, var_name): + """ + Return dequantized variable name for the input `var_name`. + """ + return "%s.dequantized" % (var_name) + + def _quantized_scale_name(self, var_name): + """ + Return quantized variable name for the input `var_name`. + """ + return "%s.scale" % (var_name) + + def _original_var_name(self, var_name): + """ + Return the original variable name. 
+ """ + if var_name.endswith('.quantized.dequantized'): + return var_name[:-len('.quantized.dequantized')] + if var_name.endswith('.quantized'): + return var_name[:-len('.quantized')] + if var_name.endswith('.dequantized'): + return var_name[:-len('.dequantized')] + if var_name.endswith('.scale'): + return var_name[:-len('.scale')] + else: + return var_name + + def _is_float(self, v): + return isinstance(v, float) or isinstance(v, np.float32) + + def _quant(self, x, scale, num_bits): + y = np.round(x / scale * ((1 << (num_bits - 1)) - 1)) + return y From e2ff300b02feda77473c8703954b990828a3a10e Mon Sep 17 00:00:00 2001 From: WangZhen Date: Sun, 20 Jan 2019 15:24:45 +0800 Subject: [PATCH 038/123] add UT for quantization. --- .../fluid/framework/details/build_strategy.cc | 1 + paddle/fluid/pybind/ir.cc | 55 ++++++- paddle/fluid/pybind/protobuf.cc | 8 +- .../paddle/fluid/contrib/slim/graph/graph.py | 80 +++++++++-- .../contrib/slim/quantization/__init__.py | 20 +++ .../quantization/quantization_performer.py | 69 +++++++-- .../unitest/test_quantization_performer.py | 135 ++++++++++++++++++ python/setup.py.in | 1 + 8 files changed, 336 insertions(+), 33 deletions(-) create mode 100644 python/paddle/fluid/contrib/slim/quantization/__init__.py create mode 100644 python/paddle/fluid/contrib/slim/unitest/test_quantization_performer.py diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index ad73085f52..e69d67dc55 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/details/sequential_execution_pass.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" namespace paddle { diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index 1205ccf7f0..24059140ab 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -15,7 +15,9 @@ #include "paddle/fluid/pybind/ir.h" #include #include +#include #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/var_desc.h" @@ -24,6 +26,7 @@ namespace py = pybind11; using paddle::framework::ir::Graph; using paddle::framework::ir::Node; +using paddle::framework::ir::GraphSafeRemoveNodes; using paddle::framework::OpDesc; using paddle::framework::ProgramDesc; using paddle::framework::VarDesc; @@ -32,6 +35,7 @@ using pybind11::return_value_policy; namespace paddle { namespace pybind { void BindGraph(py::module *m) { + m->def("graph_safe_remove_nodes", GraphSafeRemoveNodes); py::class_>( *m, "Graph", "The graph is a Directed Acyclic Single Static Assignment Graph, see " @@ -43,6 +47,7 @@ void BindGraph(py::module *m) { .def("get_double", &Graph::Get) .def("get_string", &Graph::Get) .def("get_program", &Graph::Get) + .def("get_marked_nodes", &Graph::Get>) .def("set", [](Graph &self, const std::string &attr_name, int attr) { return self.Set(attr_name, new int(attr)); }) .def("set", @@ -63,6 +68,12 @@ void BindGraph(py::module *m) { const ProgramDesc &attr) { return self.Set(attr_name, new ProgramDesc(attr)); }) + .def("set", + [](Graph &self, const std::string &attr_name, + const std::unordered_set &attr) { + 
return self.Set(attr_name, + new std::unordered_set(attr)); + }) .def("erase", &Graph::Erase) .def("nodes", &Graph::Nodes, return_value_policy::reference) .def("create_var_node", @@ -91,12 +102,52 @@ void BindNode(py::module *m) { py::class_ node(*m, "Node"); node.def("name", &Node::Name) .def("node_type", &Node::NodeType) - .def("var", &Node::Var) - .def("op", &Node::Op) + .def("var", &Node::Var, return_value_policy::reference) + .def("op", &Node::Op, return_value_policy::reference) .def("id", &Node::id) .def("is_op", &Node::IsOp) .def("is_var", &Node::IsVar) .def("is_ctrl_var", &Node::IsCtrlVar) + .def("inputs_remove", + [](Node &self, int node_id) { + for (auto it = self.inputs.begin(); it != self.inputs.end(); + it++) { + if ((*it)->id() == node_id) { + self.inputs.erase(it); + } + } + }) + .def("inputs_remove", + [](Node &self, Node &node) { + for (auto it = self.inputs.begin(); it != self.inputs.end(); + it++) { + if (*it == &node) { + self.inputs.erase(it); + } + } + }) + .def("inputs_append", + [](Node &self, Node &node) { self.inputs.push_back(&node); }) + .def("outputs_remove", + [](Node &self, int node_id) { + for (auto it = self.outputs.begin(); it != self.outputs.end(); + it++) { + if ((*it)->id() == node_id) { + self.outputs.erase(it); + } + } + }) + .def("outputs_remove", + [](Node &self, Node &node) { + for (auto it = self.outputs.begin(); it != self.outputs.end(); + it++) { + if (*it == &node) { + self.outputs.erase(it); + } + } + }) + .def("outputs_append", + [](Node &self, Node &node) { self.outputs.push_back(&node); }) .def_readwrite("inputs", &Node::inputs) .def_readwrite("outputs", &Node::outputs); diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 09c08f1ffc..e729be4a95 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -228,13 +228,7 @@ void BindBlockDesc(pybind11::module *m) { void BindVarDsec(pybind11::module *m) { pybind11::class_ var_desc(*m, "VarDesc", ""); - var_desc - .def("__init__", - [](pd::VarDesc &self, const pybind11::bytes &binary_str) { - std::string str(binary_str); - new (&self) pd::VarDesc(str); - }, - pybind11::return_value_policy::reference) + var_desc.def(pybind11::init()) .def("name", &pd::VarDesc::Name, pybind11::return_value_policy::reference) .def("set_name", &pd::VarDesc::SetName) .def("set_shape", &pd::VarDesc::SetShape) diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py index 774da2d1ef..61f9f950c4 100644 --- a/python/paddle/fluid/contrib/slim/graph/graph.py +++ b/python/paddle/fluid/contrib/slim/graph/graph.py @@ -11,12 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +from __future__ import print_function +import os +import subprocess from ....framework import Program from ....framework import Block from .... 
import core -__all__ = ['Graph', 'ImitationGraph', 'PyGraph'] +__all__ = ['Graph', 'ImitationGraph', 'IRGraph', 'PyGraph'] class PyGraph(object): @@ -30,17 +32,18 @@ class PyGraph(object): self.graph = graph def all_parameters(self): - params = [] + param_nodes = set() for node in self.graph.nodes(): - if node.is_var() and node.var().persistable(): - params.append(node) - return params + if node.is_var() and node.var() is not None and node.var( + ).persistable(): + param_nodes.add(node) + return param_nodes def all_vars(self): - return [node for node in self.graph.nodes() if node.is_var()] + return {node for node in self.graph.nodes() if node.is_var()} def all_ops(self): - return [node for node in self.graph.nodes() if node.is_op()] + return {node for node in self.graph.nodes() if node.is_op()} def create_param_node(self, name, var_type, shape, var_dtype): var_desc = core.VarDesc(name) @@ -65,10 +68,16 @@ class PyGraph(object): op_desc.set_type(op_type) for attr, value in attrs.iteritems(): self._update_desc_attr(op_desc, attr, value) - for input_name, var_node in inputs.iteritems(): - op_desc.set_input(input_name, [var_node.name()]) - for output_name, var_node in outputs.iteritems(): - op_desc.set_output(output_name, [var_node.name()]) + for input_name, var_nodes in inputs.iteritems(): + if not isinstance(var_nodes, list): + var_nodes = [var_nodes] + op_desc.set_input(input_name, + [var_node.name() for var_node in var_nodes]) + for output_name, var_nodes in outputs.iteritems(): + if not isinstance(var_nodes, list): + var_nodes = [var_nodes] + op_desc.set_output(output_name, + [var_node.name() for var_node in var_nodes]) return self.graph.create_op_node(op_desc) def create_op_node_from_desc(self, op_desc): @@ -89,6 +98,49 @@ class PyGraph(object): else: desc._set_attr(name, val) + def safe_remove_nodes(self, remove_nodes): + if not isinstance(remove_nodes, set): + remove_nodes = set(remove_nodes) + core.graph_safe_remove_nodes(self.graph, remove_nodes) + + def draw_graph(self, save_path, name, marked_nodes=None): + def _convert_to_pdf(dot_file_path): + pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf' + exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \ + + ' -o ' + pdf_save_path, shell=True) + if exited_code != 0: + print('The dot command is needed for creating pdf files.') + print('The {} is saved as the dot filetype.'.format( + dot_file_path)) + + remove_ctr_vars = set() + ops_num = 0 + for node in self.graph.nodes(): + if node.is_ctrl_var(): + remove_ctr_vars.add(node) + elif node.is_op(): + ops_num += 1 + print('Total ops num = {}.'.format(ops_num)) + self.safe_remove_nodes(remove_ctr_vars) + if marked_nodes is not None: + if not isinstance(marked_nodes, set): + marked_nodes = set(marked_nodes) + marked_nodes = marked_nodes - remove_ctr_vars + self.graph.set('__graphviz__marked_node__', marked_nodes) + viz_dot_path = os.path.join(save_path, name) + '.dot' + viz_pass = core.get_pass('graph_viz_pass') + viz_pass.set_str('graph_viz_path', viz_dot_path) + viz_pass.apply(self.graph) + _convert_to_pdf(viz_dot_path) + + def to_program(self): + convert_pass = core.get_pass('graph_to_program_pass') + convert_pass.set_program('program', Program().desc) + convert_pass.apply(self.graph) + program = Program() + program.desc = convert_pass.get_program('program') + return program + class Graph(object): """ @@ -112,3 +164,7 @@ class ImitationGraph(Graph): def all_parameters(self): return self.program.global_block().all_parameters() + + +class IRGraph(Graph): + pass diff --git 
a/python/paddle/fluid/contrib/slim/quantization/__init__.py b/python/paddle/fluid/contrib/slim/quantization/__init__.py
new file mode 100644
index 0000000000..f522385417
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/quantization/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from . import quantization_performer
+from .quantization_performer import *
+
+__all__ = quantization_performer.__all__
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py b/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py
index 7d9207dfbc..ac84b763a6 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py
@@ -19,6 +19,8 @@ from ....initializer import Constant
 from .... import unique_name
 from ..graph import PyGraph
 
+__all__ = ['QuantizationPerformer']
+
 
 class QuantizationPerformer(object):
     def __init__(self,
@@ -108,19 +110,62 @@ class QuantizationPerformer(object):
                 graph, quant_var_node, scale_var_node, quant_bits)
             dequantized_vars[var_node.name()] = dequant_var_node
             self._update_input(var_node, dequant_var_node, op)
+            op.op()._rename_input(var_node.name(), dequant_var_node.name())
+
+        def _transform_backward(graph, op):
+            no_dequantized_input_vars = True
+            for var_node in op.inputs:
+                if var_node.name() in dequantized_vars:
+                    dequant_var_node = dequantized_vars[var_node.name()]
+                    self._update_input(var_node, dequant_var_node, op)
+                    op.op()._rename_input(var_node.name(),
+                                          dequant_var_node.name())
+                    no_dequantized_input_vars = False
+            if no_dequantized_input_vars:
+                raise ValueError("There are no dequantized inputs for op %s." %
+                                 (op.name()))
 
         if not self.is_test:
            self._create_global_step(graph)
        ops = graph.all_ops()
+        # _transform_forward and _transform_backward must run in two separate
+        # loops, since the backward renaming depends on the complete forward
+        # result.
+        # The loop for transforming the forward graph:
        for op in ops:
-            # transform the forward graph
            if op.name() in self.quantizable_ops:
                _transform_forward(graph, op)
-        # rename the inputs of backward op
+        # The loop for renaming the inputs of backward ops.
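+        # (Illustrative note, with made-up names: if the forward loop rewired
+        #  a conv2d to read `conv_w.quantized.dequantized` instead of
+        #  `conv_w`, the matching conv2d_grad op must have its input renamed
+        #  the same way here; otherwise it would still reference the original
+        #  `conv_w` and the graph would be inconsistent.)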
+ for op in ops: if op.name() in self.quantizable_grad_ops: _transform_backward(graph, op) + return self.need_inited_outer + def _create_global_step(self, graph): + if self.weight_quantize_type == 'range_abs_max' or \ + self.activation_quantize_type == 'range_abs_max': + counter_name = '@STEP_COUNTER@' + for node in graph.all_vars(): + if node.name() == counter_name: + self.global_step = node + if self.global_step is None: + global_step_in = graph.create_param_node( + name=counter_name, + var_type=core.VarDesc.VarType.LOD_TENSOR, + shape=[1], + var_dtype=core.VarDesc.VarType.INT64) + self.need_inited_outer[global_step_in.var()] = \ + Constant(value=0, force_cpu=True) + global_step_out = graph.create_var_node_from_desc( + global_step_in.var()) + increment_op = graph.create_op_node( + op_type='increment', + attrs={'step': 1.0}, + inputs={'X': global_step_in}, + outputs={'Out': global_step_out}) + self._link_to(global_step_in, increment_op) + self._link_to(increment_op, global_step_out) + self.global_step = global_step_out + def _insert_quant_op(self, graph, var_node, quant_bits, quant_type): """ Insert fake_quantize_op in the graph. @@ -128,8 +173,8 @@ class QuantizationPerformer(object): if quant_type == 'abs_max': return self._insert_quant_abs_max_op(graph, var_node, quant_bits) elif quant_type == 'range_abs_max': - return self._inser_quant_range_abs_max_op(graph, var_node, - quant_bits) + return self._insert_quant_range_abs_max_op(graph, var_node, + quant_bits) def _insert_quant_abs_max_op(self, graph, var_node, quant_bits): """ @@ -237,14 +282,14 @@ class QuantizationPerformer(object): return dequant_var_node def _update_input(self, old_input_node, new_input_node, op_node): - old_input_node.outputs.remove(op_node) - op_node.inputs.remove(old_input_node) - new_input_node.outputs.append(op_node) - op_node.inputs.append(new_input_node) - - def _link_to(node_in, node_out): - node_in.outputs.append(node_out) - node_out.inputs.append(node_in) + old_input_node.outputs_remove(op_node) + op_node.inputs_remove(old_input_node) + new_input_node.outputs_append(op_node) + op_node.inputs_append(new_input_node) + + def _link_to(self, node_in, node_out): + node_in.outputs_append(node_out) + node_out.inputs_append(node_in) def _quantized_var_name(self, var_name): """ diff --git a/python/paddle/fluid/contrib/slim/unitest/test_quantization_performer.py b/python/paddle/fluid/contrib/slim/unitest/test_quantization_performer.py new file mode 100644 index 0000000000..771d880a28 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/unitest/test_quantization_performer.py @@ -0,0 +1,135 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
+ +import unittest +import random +import numpy as np +import paddle +import paddle.fluid as fluid +import six +from paddle.fluid.framework import Program +from paddle.fluid.contrib.slim.quantization import QuantizationPerformer +from paddle.fluid.contrib.slim.graph import PyGraph +from paddle.fluid import core + + +def linear_fc(num): + data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + hidden = fluid.layers.fc(hidden, size=128, act='relu') + loss = fluid.layers.cross_entropy(input=hidden, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def residual_block(num): + def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + act='relu', + bias_attr=False): + tmp = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=bias_attr) + return fluid.layers.batch_norm(input=tmp, act=act) + + data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True) + short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None) + hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu') + fc = fluid.layers.fc(input=hidden, size=10) + loss = fluid.layers.cross_entropy(input=fc, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestQuantizationPerformer(unittest.TestCase): + def setUp(self): + # since quant_op and dequant_op is not ready, use cos and sin for test + self.weight_quant_op_type = 'fake_quantize_abs_max' + self.dequant_op_type = 'fake_dequantize_max_abs' + self.quantizable_op_and_inputs = { + 'conv2d': ['Input', 'Filter'], + 'depthwise_conv2d': ['Input', 'Filter'], + 'mul': ['X', 'Y'] + } + self.quantizable_op_grad_and_inputs = { + 'conv2d_grad': ['Input', 'Filter'], + 'depthwise_conv2d_grad': ['Input', 'Filter'], + 'mul_grad': ['X', 'Y'] + } + + def linear_fc_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = linear_fc(3) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + graph = PyGraph(core.Graph(main.desc)) + performer = QuantizationPerformer(activation_quantize_type=quant_type) + performer.quantize_transform(graph, False) + marked_nodes = set() + for op in graph.all_ops(): + if op.name().find('quantize') > -1: + marked_nodes.add(op) + graph.draw_graph('.', 'quantize_fc_' + quant_type, marked_nodes) + + def test_linear_fc_quant_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.linear_fc_quant('abs_max') + + def test_linear_fc_quant_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.linear_fc_quant('range_abs_max') + + def residual_block_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = residual_block(2) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + graph = PyGraph(core.Graph(main.desc)) + performer = QuantizationPerformer(activation_quantize_type=quant_type) + performer.quantize_transform(graph, False) + marked_nodes = set() + for op in graph.all_ops(): + if op.name().find('quantize') > -1: + marked_nodes.add(op) + graph.draw_graph('.', 'quantize_residual_' + 
quant_type, marked_nodes) + + def test_residual_block_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.residual_block_quant('abs_max') + + def test_residual_block_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.residual_block_quant('range_abs_max') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index c9afe6c885..e41bd4d377 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -113,6 +113,7 @@ packages=['paddle', 'paddle.fluid.contrib.slim.core', 'paddle.fluid.contrib.slim.graph', 'paddle.fluid.contrib.slim.prune', + 'paddle.fluid.contrib.slim.quantization', 'paddle.fluid.contrib.utils', 'paddle.fluid.transpiler', 'paddle.fluid.transpiler.details'] From 5e5e6a32251ddd67defcf301262c85d6ec00efbb Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 21 Jan 2019 14:26:18 +0800 Subject: [PATCH 039/123] fix the prompt when dll load failed on windows test=develop --- python/paddle/fluid/framework.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 569ca2a4f7..77239f2e8e 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -37,11 +37,13 @@ try: from . import core except ImportError as e: if os.name == 'nt': + executable_path = os.path.abspath(os.path.dirname(sys.executable)) raise ImportError( - """NOTE: You may need to run \"set PATH=c:\python27\lib:%PATH%\" - if you encounters \"mkldnn.dll not found\" errors. If you have python - installed in other directory, replace \"c:\python27\lib" with your own - directory. The original error is: \n""" + cpt.get_exception_message(e)) + """NOTE: You may need to run \"set PATH=%s;%%PATH%%\" + if you encounters \"DLL load failed\" errors. If you have python + installed in other directory, replace \"%s\" with your own + directory. 
The original error is: \n %s""" % + (executable_path, executable_path, cpt.get_exception_message(e))) else: raise ImportError( """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" From 885c4e57abdace2e769697b4b464edbfa62b19e6 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Mon, 21 Jan 2019 14:31:02 +0800 Subject: [PATCH 040/123] fea/infer memory optim2 (#14953) --- paddle/fluid/framework/ir/fc_fuse_pass.cc | 1 + paddle/fluid/framework/ir/graph_helper.cc | 143 +++- paddle/fluid/framework/ir/graph_helper.h | 17 + .../framework/ir/graph_to_program_pass.cc | 31 +- .../framework/ir/graph_to_program_pass.h | 4 + paddle/fluid/framework/ir/graph_viz_pass.cc | 2 +- paddle/fluid/framework/ir/node.h | 2 +- paddle/fluid/framework/naive_executor.cc | 7 +- .../fluid/inference/analysis/CMakeLists.txt | 1 + paddle/fluid/inference/analysis/analyzer.cc | 17 +- paddle/fluid/inference/analysis/analyzer.h | 2 +- .../inference/analysis/analyzer_tester.cc | 4 + paddle/fluid/inference/analysis/argument.h | 11 + paddle/fluid/inference/analysis/helper.h | 7 + .../inference/analysis/ir_pass_manager.cc | 1 + .../analysis/ir_passes/CMakeLists.txt | 2 +- .../analysis/ir_passes/subgraph_detector.cc | 1 - .../ir_passes/tensorrt_subgraph_pass.cc | 4 + .../inference/analysis/passes/CMakeLists.txt | 13 +- .../passes/ir_analysis_compose_pass.cc | 62 -- .../analysis/passes/ir_analysis_pass.cc | 14 +- .../analysis/passes/ir_analysis_pass.h | 3 + .../passes/ir_graph_to_program_pass.cc | 45 ++ ...pose_pass.h => ir_graph_to_program_pass.h} | 20 +- .../analysis/passes/memory_optimize_pass.cc | 647 ++++++++++++++++++ .../analysis/passes/memory_optimize_pass.h | 106 +++ .../fluid/inference/analysis/passes/passes.cc | 13 +- paddle/fluid/inference/api/CMakeLists.txt | 11 +- paddle/fluid/inference/api/analysis_config.cc | 102 ++- .../fluid/inference/api/analysis_predictor.cc | 92 ++- .../fluid/inference/api/analysis_predictor.h | 10 + .../api/analysis_predictor_tester.cc | 51 ++ paddle/fluid/inference/api/demo_ci/run.sh | 1 + paddle/fluid/inference/api/helper.h | 15 +- .../inference/api/paddle_analysis_config.h | 11 + .../inference/api/paddle_pass_builder.cc | 5 + .../fluid/inference/api/paddle_pass_builder.h | 46 +- .../fluid/inference/tests/api/CMakeLists.txt | 4 +- .../tests/api/analyzer_dam_tester.cc | 30 + .../analyzer_text_classification_tester.cc | 2 + .../tests/api/analyzer_vis_tester.cc | 9 +- .../fluid/inference/tests/api/tester_helper.h | 6 +- .../inference/tests/api/trt_models_tester.cc | 5 + paddle/fluid/inference/utils/benchmark.h | 2 +- .../fluid/inference/utils/benchmark_tester.cc | 4 +- paddle/fluid/operators/controlflow/feed_op.cc | 1 + paddle/fluid/string/pretty_log.h | 17 + 47 files changed, 1450 insertions(+), 154 deletions(-) delete mode 100644 paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc create mode 100644 paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc rename paddle/fluid/inference/analysis/passes/{ir_analysis_compose_pass.h => ir_graph_to_program_pass.h} (59%) create mode 100644 paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc create mode 100644 paddle/fluid/inference/analysis/passes/memory_optimize_pass.h diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 26eac93905..12b31da010 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/fc_fuse_pass.h" #include #include 
+#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index d99f856d8f..8de93cf285 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -18,8 +18,10 @@ limitations under the License. */ #include #include #include +#include #include #include +#include "paddle/fluid/framework/ir/graph_traits.h" DEFINE_string(print_sub_graph_dir, "", "FLAGS_print_sub_graph_dir is used " @@ -41,7 +43,7 @@ void SortHelper( } } - VLOG(3) << "topology sort insert: " << node->Name() + VLOG(5) << "topology sort insert: " << node->Name() << " " << reinterpret_cast(node) << " input " << node->inputs.size(); ret->push_back(node); } @@ -99,12 +101,13 @@ std::vector TopologySortOperations(const Graph &graph) { return ret; } +// Build operator inlink edge table. std::map> BuildOperationAdjList( const Graph &graph) { std::map> adj_list; for (auto &n : graph.Nodes()) { - if (n->NodeType() != ir::Node::Type::kOperation) continue; + if (!n->IsOp()) continue; if (adj_list.find(n) == adj_list.end()) { adj_list[n] = std::unordered_set(); } @@ -121,6 +124,119 @@ std::map> BuildOperationAdjList( return adj_list; } +// Build operator outlink edge table. +std::map> BuildOperationOutAdjList( + const Graph &graph) { + std::map> adj_list; + + for (auto &n : graph.Nodes()) { + if (!n->IsOp()) continue; + if (adj_list.find(n) == adj_list.end()) { + adj_list[n] = std::unordered_set(); + } + for (auto &var : n->outputs) { + for (auto &adj_n : var->outputs) { + PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); + VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) + << " -> " << n->Name() << reinterpret_cast(n) + << " via " << var->Name() << reinterpret_cast(var); + adj_list[n].insert(adj_n); + } + } + } + return adj_list; +} + +std::vector OpDFSSort(const Graph &graph) { + auto edge_table = BuildOperationOutAdjList(graph); + std::stack stack; + for (auto &ele : edge_table) { + if (ele.first->inputs.empty()) { + // find the input ops (those without input vars) + stack.push(ele.first); + } else { + // find the ops with only persistable vars as inputs. + bool all_persistable = true; + for (auto *input : ele.first->inputs) { + if (!(input->IsVar() && input->Var() && input->Var()->Persistable())) { + all_persistable = false; + } + } + if (all_persistable) { + stack.push(ele.first); + } + } + } + + std::vector res; + // start from the feed op and DFS + std::unordered_set unique_set; + while (!stack.empty()) { + // will start from the last feed by default. + auto cur = stack.top(); + stack.pop(); + unique_set.insert(cur); + res.push_back(cur); + + for (auto *op : edge_table[cur]) { + if (!unique_set.count(op)) { + stack.push(op); + } + } + } + return res; +} + +std::vector TopologyDfsSortOperations(const Graph &graph) { + std::vector nodes; + std::unordered_map in_degree; + + auto set_out_ops_ready = [&](Node *var) { + for (auto *op : var->outputs) { + --in_degree[op]; + } + }; + // build in_degree + for (auto *node : graph.Nodes()) { + if (node->IsOp()) { + in_degree[node] += node->inputs.size(); + } else if (node->IsVar() && node->inputs.empty()) { + // put all the inputs of the whole graph ready. 
+ set_out_ops_ready(node); + } + } + + std::deque op_queue; + // first visit + for (auto &node : OpDFSSort(graph)) { + if (node->IsOp()) { + op_queue.push_back(node); + } + } + + // traverse the graph + int num_ops = op_queue.size(); + while (num_ops) { + for (auto it = op_queue.begin(); it != op_queue.end(); it++) { + auto *&cur_op = *it; + if (!cur_op || in_degree[cur_op] > 0) continue; + // visit this node + // put all the output var of this op valid. + for (auto *out_var : cur_op->outputs) { + if (!out_var) continue; + set_out_ops_ready(out_var); + } + VLOG(8) << "visit " << cur_op->Name(); + nodes.push_back(cur_op); + + cur_op = nullptr; + num_ops--; + } + } + + return nodes; +} + size_t GraphNum(const Graph &graph) { std::unordered_set nodes(graph.Nodes()); std::unordered_set visited_nodes; @@ -203,6 +319,29 @@ size_t GraphNum(const Graph &graph) { return graph_count; } +void CleanIndividualNodes(Graph *graph) { + std::unordered_set nodes2rm; + for (auto *node : graph->Nodes()) { + if (node->inputs.empty() && node->outputs.empty()) { + nodes2rm.insert(node); + } + } + + for (auto *node : nodes2rm) { + graph->RemoveNode(node); + } +} + +std::vector TopologyVarientSort(const Graph &graph, + SortKind sort_kind) { + switch (sort_kind) { + case SortKind::TS: + return framework::ir::TopologySortOperations(graph); + default: + return framework::ir::TopologyDfsSortOperations(graph); + } +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index be525151f9..fba4936f2c 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -34,6 +34,23 @@ size_t GraphNum(const Graph &graph); // `graph` cannot contain circle. std::vector TopologySortOperations(const Graph &graph); +// Topological sort, but try to DFS. +std::vector TopologyDfsSortOperations(const Graph &graph); + +// Different kinds to sort the operators in a graph to a sequence. +enum class SortKind { + // Topological Search + TS = 0, + // Topological and Depth First Search + TDFS +}; + +// Several kinds of topological sort. +std::vector TopologyVarientSort(const Graph &graph, SortKind sort_kind); + +// Clean the nodes that doesn't connect to others. +void CleanIndividualNodes(Graph *graph); + // Build an adjacency list of operations for the `graph`. std::map> BuildOperationAdjList( const Graph &graph); diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc index 36f3693326..3372dcd181 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.cc +++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" - #include "paddle/fluid/framework/program_desc.h" namespace paddle { @@ -29,6 +28,14 @@ namespace ir { std::unique_ptr GraphToProgramPass::ApplyImpl( std::unique_ptr graph) const { + // Remove the unneeded variables after memory optimization. 
+ std::unordered_set vars2remove; + if (graph->Has(kGraphToProgramVarsToRemove)) { + vars2remove = graph->Get>( + kGraphToProgramVarsToRemove); + VLOG(2) << "graph to program remove " << vars2remove.size() << " nodes"; + } + ProgramDesc& program = Get("program"); std::unique_ptr program_pb( @@ -40,25 +47,35 @@ std::unique_ptr GraphToProgramPass::ApplyImpl( std::unordered_set visited_vars; for (ir::Node* n : graph->Nodes()) { if (n->IsVar()) { - if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) { + if (n->Var() && visited_vars.count(n->Var()->Name()) == 0 && + !vars2remove.count(n->Var()->Name())) { visited_vars.insert(n->Var()->Name()); block->add_vars()->MergeFrom(*n->Var()->Proto()); } } } - block->clear_ops(); - std::vector nodes = TopologySortOperations(*graph); + + std::vector nodes; + if (Has(kGraphToProgramSortKind)) { + // Inference Memory Optimize relays on this branch. + int sort_kind = Get(kGraphToProgramSortKind); + nodes = TopologyVarientSort( + *graph, static_cast(sort_kind)); + } else { + nodes = TopologySortOperations(*graph); + } + for (ir::Node* n : nodes) { - if (!n->Op()) { - continue; - } + if (!n->Op()) continue; + block->add_ops()->MergeFrom(*n->Op()->Proto()); } program.CopyFrom(*program_pb); return graph; } + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.h b/paddle/fluid/framework/ir/graph_to_program_pass.h index 124ec5a8e7..4c36c3a5da 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.h +++ b/paddle/fluid/framework/ir/graph_to_program_pass.h @@ -20,6 +20,10 @@ namespace paddle { namespace framework { namespace ir { +const char kGraphToProgramVarsToRemove[] = + "__graph_to_program_vars_to_remove__"; +const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__"; + class GraphToProgramPass : public Pass { protected: std::unique_ptr ApplyImpl(std::unique_ptr graph) const override; diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index 31ed98db72..87a28a2a66 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -135,4 +135,4 @@ GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes( } // namespace paddle REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass) - .RequirePassAttr(paddle::framework::ir::kGraphVizPath); + .RequirePassAttr(paddle::framework::ir::kGraphVizPath); \ No newline at end of file diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 89dcc677b5..9eade9eaa8 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -64,7 +64,7 @@ class Node { std::string Name() const { return name_; } - VarDesc* Var() { + VarDesc* Var() const { PADDLE_ENFORCE(IsVar()); return var_desc_.get(); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 86e6b1f7d9..a37bb6f4da 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -50,8 +50,8 @@ void NaiveExecutor::Run() { "running Paddle Inference"; #endif // PADDLE_ON_INFERENCE for (auto &op : ops_) { - VLOG(3) << std::this_thread::get_id() << " run " << op->Type() - << " on scope " << scope_; + VLOG(4) << std::this_thread::get_id() << " run " + << op->DebugStringEx(scope_) << " on scope " << scope_; op->SetIsCalledByExecutor(false); op->Run(*scope_, place_); } @@ -69,10 +69,12 @@ void NaiveExecutor::CreateVariables(const 
ProgramDesc &desc, int block_id, anc = anc->parent(); } + int num_vars = 0; for (auto &var : global_block.AllVars()) { if (var->Name() == framework::kEmptyVarName) { continue; } + num_vars++; if (persistable == var->Persistable()) { if (persistable) { @@ -90,6 +92,7 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id, } } } + VLOG(4) << "naive executor create " << num_vars << " vars"; } void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id, diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 27b6b80955..7a795bda82 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -18,6 +18,7 @@ cc_library(analysis SRCS analyzer.cc analysis_pass DEPS ${analysis_deps} analysis_helper + ${INFER_IR_PASSES} ) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index c8ed373ee7..d82a063d88 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -15,8 +15,8 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include #include -#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h" #include "paddle/fluid/inference/analysis/passes/passes.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace inference { @@ -24,13 +24,16 @@ namespace analysis { Analyzer::Analyzer() {} -void Analyzer::Run(Argument *argument) { RunIrAnalysis(argument); } +void Analyzer::Run(Argument *argument) { RunAnalysis(argument); } -void Analyzer::RunIrAnalysis(Argument *argument) { - std::vector passes({"ir_analysis_compose_pass"}); - - for (auto &pass : passes) { - PassRegistry::Global().Retreive(pass)->Run(argument); +void Analyzer::RunAnalysis(Argument *argument) { + PADDLE_ENFORCE(argument->analysis_passes_valid(), + "analsis_passes is not valid in the argument."); + for (auto &pass : argument->analysis_passes()) { + string::PrettyLogH1("--- Running analysis [%s]", pass); + auto *ptr = PassRegistry::Global().Retreive(pass); + PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass); + ptr->Run(argument); } } diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index b43e67f20f..a6de18db60 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -54,7 +54,7 @@ class Analyzer final { DISABLE_COPY_AND_ASSIGN(Analyzer); protected: - void RunIrAnalysis(Argument* argument); + void RunAnalysis(Argument* argument); }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 4c84d02d86..c814ce4548 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -32,6 +32,8 @@ TEST(Analyzer, analysis_without_tensorrt) { argument.SetModelDir(FLAGS_inference_model_dir); argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); argument.SetUseGPU(false); + argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass"}); Analyzer analyser; analyser.Run(&argument); @@ -44,6 +46,8 @@ TEST(Analyzer, analysis_with_tensorrt) { argument.SetModelDir(FLAGS_inference_model_dir); argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); argument.SetUseGPU(false); + 
argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass"}); Analyzer analyser; analyser.Run(&argument); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 2d8980b1d1..88ce61f9b9 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -110,16 +110,20 @@ struct Argument { // The overall Scope to work on. DECL_ARGUMENT_UNIQUE_FIELD(scope, Scope, framework::Scope); + // The default program, loaded from disk. DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc); // The ir passes to perform in analysis phase. DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses, std::vector); + DECL_ARGUMENT_FIELD(analysis_passes, AnalysisPasses, + std::vector); // Pass a set of op types to enable its mkldnn kernel DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, std::unordered_set); + // Passed from config. DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); @@ -127,6 +131,13 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); + // Memory optimized related. + DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); + DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool); + // Indicate which kind of sort algorithm is used for operators, the memory + // optimization relays on the sort algorithm. + DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int); + // The program transformed by IR analysis phase. DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, framework::proto::ProgramDesc); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 269a0da9f9..de04713b53 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -28,6 +28,13 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/port.h" +#ifdef _WIN32 +#define GCC_ATTRIBUTE(attr__) ; +#else +#define GCC_ATTRIBUTE(attr__) __attribute__((attr__)); +#endif +#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result) + namespace paddle { namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index e37fea38bc..4e14642264 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -83,6 +83,7 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { PADDLE_ENFORCE(graph.get()); // Apply all the passes for (const auto &pass : passes_) { + if (pass->Type() == "graph_viz_pass") continue; PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); graph = pass->Apply(std::move(graph)); } diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index 9ae5b8aa17..eb6e1768a2 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) -if (TENSORRT_FOUND) +if (WITH_GPU AND TENSORRT_FOUND) cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) set(analysis_deps ${analysis_deps} diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc index b6a5dfd087..a64f85ee9a 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc @@ -413,7 +413,6 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() { auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)(); for (auto &subgraph : subgraphs) { if (subgraph.size() <= (size_t)min_subgraph_size_) continue; - LOG(INFO) << "detect a subgraph size " << subgraph.size(); std::unordered_set subgraph_uniq(subgraph.begin(), subgraph.end()); // replace this sub-graph with the first node. Two steps: 1. Create a Block // Node that contains this subgraph 2. 
Mark the nodes inside the sub-graph diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index bc06e78ae6..5f25303cc1 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace inference { @@ -77,6 +78,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, framework::BlockDesc block_desc(nullptr, &block_proto); block_desc.Proto()->set_parent_idx(-1); block_desc.Proto()->set_idx(0); + string::PrettyLogDetail("--- detect a sub-graph with %d nodes", + subgraph.size()); + for (auto *node : subgraph) { auto *op = block_desc.AppendOp(); *op->Proto() = *node->Op()->Proto(); diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index add9b70f2c..691c336ebe 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -1,11 +1,18 @@ cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) +cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass) cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager) -cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass) +cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass) + +cc_library(analysis_passes SRCS passes.cc DEPS + ir_graph_build_pass + ir_analysis_pass + ir_params_sync_among_devices_pass + memory_optim_pass + ir_graph_to_program_pass +) set(analysis_deps ${analysis_deps} - ir_graph_build_pass - ir_analysis_pass analysis_passes subgraph_detector CACHE INTERNAL "") diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc deleted file mode 100644 index 490189e550..0000000000 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h" -#include -#include -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/inference/analysis/ir_pass_manager.h" -#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" -#include "paddle/fluid/string/pretty_log.h" - -namespace paddle { -namespace inference { -namespace analysis { - -void IrAnalysisComposePass::RunImpl(Argument *argument) { - ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); - ApplyIrPasses(argument); - CollectFusionStatis(argument); -} - -std::string IrAnalysisComposePass::repr() const { - return "ir-analysis-compose-pass"; -} - -void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { - std::vector passes({ - "ir_graph_build_pass", "ir_analysis_pass", - "ir_params_sync_among_devices_pass", - }); - for (const auto &pass : passes) { - VLOG(2) << "Run pass " << pass; - auto *the_pass = PassRegistry::Global().Retreive(pass); - the_pass->Run(argument); - } -} - -void IrAnalysisComposePass::CollectFusionStatis(Argument *argument) { - if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) { - LOG(INFO) << "argument has no fuse statis"; - return; - } - argument->SetFusionStatis( - argument->main_graph().Get( - framework::ir::kFuseStatisAttr)); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc index e327bd39f0..d986811a82 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h" namespace paddle { @@ -31,9 +32,18 @@ void IrAnalysisPass::RunImpl(Argument* argument) { IRPassManager the_ir_manager(argument); graph = the_ir_manager.Apply(std::move(graph)); PADDLE_ENFORCE_GT(graph->Nodes().size(), 0); - argument->SetIrAnalyzedProgram(new framework::proto::ProgramDesc( - the_ir_manager.AcquireProgram(&graph, argument->main_program()))); argument->SetMainGraph(graph.release()); + CollectFusionStatis(argument); +} + +void IrAnalysisPass::CollectFusionStatis(Argument* argument) { + if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) { + LOG(INFO) << "argument has no fuse statis"; + return; + } + argument->SetFusionStatis( + argument->main_graph().Get( + framework::ir::kFuseStatisAttr)); } std::string IrAnalysisPass::repr() const { return "ir-analysis-pass"; } diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h index d8a7449807..2c2113c06d 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h @@ -29,6 +29,9 @@ namespace analysis { class IrAnalysisPass : public AnalysisPass { public: void RunImpl(Argument* argument) override; + + void CollectFusionStatis(Argument* argument); + std::string repr() const override; }; diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc new file mode 100644 index 0000000000..f1da37af3c --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h" +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void IrGraphToProgramPass::RunImpl(Argument *argument) { + auto pass = + framework::ir::PassRegistry::Instance().Get("graph_to_program_pass"); + + if (argument->memory_optim_sort_kind_valid()) { + pass->Set(framework::ir::kGraphToProgramSortKind, + new int(argument->memory_optim_sort_kind())); + } + + std::unique_ptr graph(argument->main_graph_ptr()); + framework::ProgramDesc desc(argument->main_program()); + pass->SetNotOwned("program", &desc); + auto thegraph = pass->Apply(std::move(graph)); + thegraph.release(); // the argument still own the graph. 
+
+  argument->SetIrAnalyzedProgram(
+      new framework::proto::ProgramDesc(*desc.Proto()));
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
similarity index 59%
rename from paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
rename to paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
index 16c6b7d84d..838ebdbc9d 100644
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
@@ -14,31 +14,17 @@
 
 #pragma once
 
-#include 
-#include 
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/passes/passes.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-/*
- * The analysis pass to run a list of IR passes (like a function call).
- * Currently, it should be the first pass of analysis phase.
- */
-class IrAnalysisComposePass : public AnalysisPass {
+class IrGraphToProgramPass : public AnalysisPass {
  public:
-  void RunImpl(Argument* argument) override;
-  std::string repr() const override;
+  void RunImpl(Argument *argument) override;
 
- private:
-  void ApplyIrPasses(Argument* argument);
-
-  void CollectFusionStatis(Argument* argument);
-
-  // Assign a Scope for IR passes to modify the weights.
-  void AssignScopeToModify(Argument* argument);
+  std::string repr() const override { return "ir-graph-to-program-pass"; }
 };
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
new file mode 100644
index 0000000000..57683c0b72
--- /dev/null
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
@@ -0,0 +1,647 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
+#include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+using framework::ir::Graph;
+using framework::ir::Node;
+using framework::ir::TopologyVarientSort;
+using space_table_t = MemoryOptimizePass::space_table_t;
+
+// Collect the lifecycles of the tensors.
+// Traverse the graph in topological order.
+// The traversal order also affects the lifecycles, so different sort_kind
+// values are used.
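+// For intuition (a made-up example, not taken from a real model): with the
+// op sequence {fc1: x -> h, fc2: h -> y, fetch: y -> out}, the collected
+// lifecycles are x:[0,0], h:[0,1], y:[1,2] and out:[2,2], while persistable
+// weights are skipped and feed outputs are pinned to [0, int_max] so that
+// they are never reused.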
+void MemoryOptimizePass::CollectLifeCycle(
+    std::unordered_map* lifecycles,
+    int sort_kind) const {
+  max_lifecycle_ = 0;
+  for (auto* op_node : framework::ir::TopologyVarientSort(
+           *graph_, static_cast(sort_kind))) {
+    if (!op_node->IsOp()) continue;
+    auto reads = op_node->inputs;
+    auto writes = op_node->outputs;
+
+    std::vector requires(reads.begin(), reads.end());
+    requires.insert(requires.end(), writes.begin(), writes.end());
+
+    // Disable reuse of feed variables.
+    if (op_node->Name() == "feed") {
+      for (auto* node : op_node->outputs) {
+        auto var = node->Name();
+        lifecycles->emplace(var,
+                            std::make_pair(0, std::numeric_limits::max()));
+      }
+    } else {
+      // Normal operators.
+      for (const Node* node : requires) {
+        if (node->Var()->Persistable()) continue;
+        std::string var = node->Name();
+        if (!lifecycles->count(var)) {
+          (*lifecycles)[var] = std::make_pair(max_lifecycle_, max_lifecycle_);
+        } else {
+          (*lifecycles)[var].second =
+              std::max(max_lifecycle_, lifecycles->at(var).second);  // max()
+        }
+      }
+    }
+
+    ++max_lifecycle_;
+  }
+}
+
+// TODO(Superjomn) Make this a general helper method.
+int DataTypeToSpace(framework::proto::VarType_Type type) {
+  switch (type) {
+    case framework::proto::VarType_Type_BOOL:
+      return sizeof(bool);
+    case framework::proto::VarType_Type_FP32:
+      return sizeof(float);
+    case framework::proto::VarType_Type_INT32:
+      return sizeof(int32_t);
+    case framework::proto::VarType_Type_INT64:
+      return sizeof(int64_t);
+    default:
+      PADDLE_THROW("Unknown data type");
+  }
+}
+
+// Collect the memory size of the tensors.
+void MemoryOptimizePass::CollectVarMemorySize(
+    const std::unordered_map& batch_var_ave_dim,
+    std::unordered_map* tensor_nodes,
+    space_table_t* space_table) const {
+  // Collect tensors from graph.
+  for (auto* node : graph_->Nodes()) {
+    if (node->IsVar() &&
+        node->Var()->GetType() ==
+            framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) {
+      // Parameters will not be reused.
+      if (node->Var()->Persistable()) continue;
+      (*tensor_nodes)[node->Name()] = node;
+      (*space_table)[node->Name()] =
+          DataTypeToSpace(node->Var()->GetDataType()) *
+          batch_var_ave_dim.at(node->Name());
+    }
+  }
+}
+
+// Find a suitable tensor to reuse: big enough, but the smallest that fits, to
+// avoid memory waste.
+//
+// Args:
+// @tensor_nodes: the tensor nodes in the ir::Graph.
+// @free_existing_tensors: the tensors that are allocated and currently free.
+// @space_table: the memory space of tensors.
+// @tensor2use: the tensor that requires memory.
+//
+// Returns:
+// true if some existing tensor is found to reuse.
+// false if there is no suitable tensor to reuse; one then needs to allocate a
+// new tensor for this requirement.
+// The suitable tensor for reuse is one whose size is approximately equal to
+// the memory demand.
+bool FindSuitableTensorToReuse(
+    const std::string& tensor, int space_required,
+    const std::unordered_map& tensor_nodes,
+    std::unordered_set* free_existing_tensors,
+    const space_table_t& space_table,
+    const std::vector>& var_clusters,
+    std::string* tensor2use) __SHOULD_USE_RESULT__;
+
+bool FindSuitableTensorToReuse(
+    const std::string& tensor, int space_required,
+    const std::unordered_map& tensor_nodes,
+    std::unordered_set* free_existing_tensors,
+    const space_table_t& space_table,
+    const std::vector>& var_clusters,
+    std::string* tensor2use) {
+  std::pair best_fit;
+  best_fit.second = std::numeric_limits::max();
+  VLOG(5) << "Split Tensors to " << var_clusters.size() << " clusters";
+
+  // find the cluster this var belongs to.
+  const std::unordered_set* cluster = nullptr;
+  for (const auto& c : var_clusters) {
+    if (c.count(tensor)) {
+      cluster = &c;
+      break;
+    }
+  }
+  PADDLE_ENFORCE_NOT_NULL(cluster,
+                          "something wrong in memory optimization, the "
+                          "variable %s is not in any cluster.",
+                          tensor);
+
+  for (auto& candidate : *free_existing_tensors) {
+    // This is not a temporary tensor.
+    if (!space_table.count(candidate)) continue;
+    // Not in the same cluster.
+    if (!cluster->count(candidate)) continue;
+
+    size_t space = space_table.at(candidate);
+    size_t space_diff = std::abs(space - space_required);
+    if (space_diff < best_fit.second) {
+      best_fit.first = candidate;
+      best_fit.second = space_diff;
+    }
+  }
+
+  if (best_fit.second < std::numeric_limits::max()) {
+    *tensor2use = best_fit.first;
+    return true;
+  }
+  return false;
+}
+
+// Allocate a new tensor instead of reusing an existing one.
+void AllocateNewTensor(
+    const std::string& name, size_t space_required,
+    const std::unordered_map& tensor_nodes,
+    std::unordered_set* free_existing_tensors,
+    space_table_t* space_table,
+    std::unordered_map* reuse_table) {
+  // The newly born tensor is free to be used.
+  free_existing_tensors->insert(name);
+  // Register the space it has.
+  PADDLE_ENFORCE(space_table->count(name));
+  space_table->at(name) = std::max(space_table->at(name), space_required);
+  // The newly allocated tensor uses its own memory.
+  (*reuse_table)[name] = name;
+}
+
+// Free a tensor and make it reusable.
+// @tensor: the tensor to free.
+// @free_existing_tensors: the allocated tensors that are currently free.
+// @reuse_table: a map from a fake tensor to the existing allocated tensor.
+void FreeATensor(const std::string& tensor,
+                 std::unordered_set* free_existing_tensors,
+                 std::unordered_map* reuse_table) {
+  if (tensor == "feed" || tensor == "fetch") return;
+  // The tensor that is actually allocated.
+  const auto& free_tensor = reuse_table->at(tensor);
+
+  free_existing_tensors->insert(free_tensor);
+}
+
+// Reuse a free existing tensor.
+void ReuseATensor(const std::string& tensor, const std::string& tensor2reuse,
+                  size_t memory_size,
+                  std::unordered_set* free_existing_tensors,
+                  std::unordered_map* reuse_table,
+                  space_table_t* reused_space_table) {
+  auto it = free_existing_tensors->find(tensor2reuse);
+  PADDLE_ENFORCE(it != free_existing_tensors->end());
+  free_existing_tensors->erase(it);
+  (*reuse_table)[tensor] = tensor2reuse;
+  // Update the memory size of a reused tensor; the memory grows if the
+  // required memory is larger.
+  (*reused_space_table)[tensor2reuse] =
+      std::max(reused_space_table->at(tensor2reuse), memory_size);
+}
+
+// Calculate the memory usage.
+void EvaluateMemoryUsage(
+    const std::unordered_map& reuse_table,
+    const space_table_t& space_table,
+    const std::unordered_map& var_batch_ave_size,
+    size_t* allocated, size_t* saved) {
+  *allocated = 0;
+  *saved = 0;
+
+  for (auto elem : reuse_table) {
+    if (elem.first == elem.second) {
+      *allocated += space_table.at(elem.first);
+      VLOG(4) << elem.first << " <-> " << elem.second << " "
+              << space_table.at(elem.first) << " "
+              << space_table.at(elem.second);
+    } else {
+      *saved += space_table.at(elem.first);
+      VLOG(4) << "reuse " << elem.first << " -> " << elem.second;
+    }
+  }
+  VLOG(4) << "allocated " << *allocated;
+  VLOG(4) << "saved " << *saved;
+}
+
+// Build the reuse plan and record the allocated and saved memory in
+// `memory_allocation`.
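+// A made-up example: with lifecycles a:[0,1], b:[1,2] and c:[4,5], all of
+// similar size and in one cluster, both a and b have been freed by the time
+// c is born, so c reuses the best-fitting of the two slots and only two
+// buffers are ever allocated for the three tensors.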
+void MemoryOptimizePass::MakeReusePlan(
+    const std::vector<std::unordered_set<std::string>>& var_clusters,
+    const std::unordered_map<std::string, size_t>& var_batch_ave_size,
+    const space_table_t& space_table,
+    std::unordered_map<std::string, std::string>* reuse_table, int sort_kind,
+    MemoryAllocation* memory_allocation) const {
+  // Clear the existing plan.
+  reuse_table->clear();
+
+  // The `space_table` stores the real memory size of each tensor.
+  // The `reused_space_table` stores the maximum memory size required by a
+  // tensor during memory reusing: a small tensor might be reused by a larger
+  // one, and the memory size of the small tensor will grow accordingly.
+  auto reused_space_table = space_table;
+
+  std::unordered_map<std::string, lifecycle_t> life_cycles;
+  std::unordered_map<std::string, Node*> tensor_nodes;
+  // The allocated tensors whose memory can be reused; they live across the
+  // whole program execution.
+  std::unordered_set<std::string> existing_tensors;
+  // The existing tensors that have been allocated and are free to reuse.
+  std::unordered_set<std::string> free_existing_tensors;
+
+  CollectLifeCycle(&life_cycles, sort_kind);
+
+  for (int age = 0; age < max_lifecycle_; ++age) {
+    std::unordered_set<std::string> born_tensors;
+    std::unordered_set<std::string> dead_tensors;
+    // Gather the dead and the born tensors.
+    for (auto elem_it = life_cycles.begin(); elem_it != life_cycles.end();
+         elem_it++) {
+      if (elem_it->second.first == -1) {
+        continue;
+      }
+      const auto& tensor = elem_it->first;
+      const auto& lifecycle = elem_it->second;
+      VLOG(4) << "process " << tensor << " reuse " << lifecycle.first << "->"
+              << lifecycle.second;
+
+      // Collect newly born tensors.
+      if (lifecycle.first == age) {
+        born_tensors.insert(tensor);
+      }
+      // Collect dead tensors whose memory can be reused.
+      else if (lifecycle.second < age) {  // NOLINT
+        dead_tensors.insert(tensor);
+        // Mark as processed to avoid handling it again.
+        elem_it->second.first = -1;  // avoid duplicate search
+      }
+    }
+
+    // Reuse the dead tensors for the born tensors.
+    for (const auto& tensor : born_tensors) {
+      // Skip the feed and fetch tensors, because they share data with others.
+      std::string tensor2reuse;
+      if (!space_table.count(tensor)) continue;
+      size_t space_required = space_table.at(tensor);
+      if (FindSuitableTensorToReuse(tensor, space_required, tensor_nodes,
+                                    &free_existing_tensors, reused_space_table,
+                                    var_clusters, &tensor2reuse)) {
+        if (tensor != tensor2reuse) {
+          VLOG(4) << tensor << " -> " << tensor2reuse;
+        }
+        ReuseATensor(tensor, tensor2reuse, space_required,
+                     &free_existing_tensors, reuse_table, &reused_space_table);
+      } else {
+        VLOG(4) << "allocate " << tensor;
+        AllocateNewTensor(tensor, space_required, tensor_nodes,
+                          &free_existing_tensors, &reused_space_table,
+                          reuse_table);
+        ReuseATensor(tensor, tensor, space_required, &free_existing_tensors,
+                     reuse_table, &reused_space_table);
+      }
+    }
+
+    for (const auto& tensor : dead_tensors) {
+      // Free its memory.
+      FreeATensor(tensor, &free_existing_tensors, reuse_table);
+    }
+  }
+
+  EvaluateMemoryUsage(*reuse_table, reused_space_table, var_batch_ave_size,
+                      &(memory_allocation->allocated),
+                      &(memory_allocation->saved));
+  memory_allocation->sort_kind = sort_kind;
+}
+
+void BuildVarNodeTable(Graph* graph,
+                       std::unordered_map<std::string, Node*>* var_node_table) {
+  for (auto* node : graph->Nodes()) {
+    if (node->IsVar()) {
+      (*var_node_table)[node->Name()] = node;
+    }
+  }
+}
+
+// NOTE: after this rewrite, the optimized OpDescs no longer match the
+// ir::Graph.
+void UpdateOpDescsByReuse(
+    Graph* graph,
+    const std::unordered_map<std::string, std::string>& reuse_table,
+    int sort_kind) {
+  // TODO(Superjomn) change this to be compatible with the runtime order.
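+  // (The walk below deliberately uses the same topological sort kind that
+  // produced the reuse plan, so the renamed arguments stay consistent with
+  // the lifetimes computed in CollectLifeCycle().)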
+  for (auto* node : TopologyVarientSort(
+           *graph, static_cast<framework::ir::SortKind>(sort_kind))) {
+    if (node->IsOp()) {
+      // Replace the original inputs/outputs with the reused tensors.
+      std::unordered_map<std::string, std::vector<std::string>> in_args,
+          out_args;
+      for (auto argument : node->Op()->Inputs()) {
+        for (const auto& x : argument.second) {
+          auto name = x;
+          if (reuse_table.count(x) && reuse_table.at(x) != x) {
+            name = reuse_table.at(x);
+          }
+          in_args[argument.first].push_back(name);
+          VLOG(4) << node->Name() << " input " << x << " -> " << name;
+        }
+      }
+
+      for (auto argument : node->Op()->Outputs()) {
+        for (const auto& x : argument.second) {
+          auto name = x;
+          if (reuse_table.count(x) && reuse_table.at(x) != x) {
+            name = reuse_table.at(x);
+          }
+          out_args[argument.first].push_back(name);
+          VLOG(4) << node->Name() << " output " << x << " -> " << name;
+        }
+      }
+
+      // Update the arguments.
+      for (auto& arg : in_args) {
+        node->Op()->SetInput(arg.first, arg.second);
+      }
+      for (auto& arg : out_args) {
+        node->Op()->SetOutput(arg.first, arg.second);
+      }
+      node->Op()->Flush();
+    }
+  }
+}
+
+void MemoryOptimizePass::PerformReusePlan(
+    const std::unordered_map<std::string, std::string>& reuse_table,
+    int sort_kind, std::unordered_set<std::string>* vars2remove) const {
+  std::unordered_map<std::string, Node*> var_node_table;
+  BuildVarNodeTable(graph_, &var_node_table);
+  UpdateOpDescsByReuse(graph_, reuse_table, sort_kind);
+
+  for (auto& item : reuse_table) {
+    if (item.first != item.second) {
+      vars2remove->insert(item.first);
+    }
+  }
+  VLOG(2) << "vars to remove: " << vars2remove->size();
+}
+
+std::vector<std::string> split(const std::string& line, char delim) {
+  std::vector<std::string> res;
+  std::string field;
+  std::stringstream line_stream(line);
+  while (std::getline(line_stream, field, delim)) {
+    res.emplace_back(field);
+  }
+  return res;
+}
+
+// Deserialize the batch var shapes from the cache file.
+std::vector<std::map<std::string, std::vector<int>>> DeseralizeBatchVarShapes(
+    const std::string& path) {
+  std::ifstream file(path);
+  PADDLE_ENFORCE(file.is_open(), "failed to open %s to read cache", path);
+  std::string line;
+  std::vector<std::map<std::string, std::vector<int>>> batch_shapes;
+
+  while (std::getline(file, line)) {
+    std::map<std::string, std::vector<int>> batch;
+    for (const auto& var_info : split(line, ';')) {
+      auto fields = split(var_info, ':');
+      PADDLE_ENFORCE_EQ(fields.size(), 2UL);
+      auto var_name = fields.front();
+      auto shape_str = split(fields[1], ',');
+      std::vector<int> shape;
+      for (const auto& v : shape_str) shape.push_back(std::stoi(v));
+      batch[var_name] = shape;
+    }
+    batch_shapes.push_back(batch);
+  }
+  return batch_shapes;
+}
+
+// Calculate the average dim product of each tensor from the batch shape cache.
+std::unordered_map<std::string, size_t> GetBatchAverageSize(
+    const std::vector<std::map<std::string, std::vector<int>>>& batches) {
+  std::unordered_map<std::string, size_t> var2size;
+  // The average size over the batches for each variable.
+  int num_batch = 0;
+  for (const auto& batch : batches) {
+    num_batch++;
+    for (const auto& item : batch) {
+      int dim = std::accumulate(item.second.begin(), item.second.end(), 1,
+                                [](int a, int b) { return a * b; });
+      var2size[item.first] += dim;
+    }
+  }
+
+  for (auto& item : var2size) {
+    item.second /= num_batch;
+  }
+
+  return var2size;
+}
+
+// Analyze the batch shapes loaded from the cache file.
+// By splitting the variables into different clusters according to their batch
+// size sequences, we can pre-schedule the changes of the different LoDTensors
+// when input sequences of different lengths are fed in.
+// This should work well for models operating on sentences.
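+// (An illustrative example, not tied to the cache format: if var A sees batch
+// sizes {4, 7, 4} across three recorded batches and var B also sees {4, 7, 4},
+// both stream the signature "474", hash to the same value and land in one
+// cluster, while a var with {1, 1, 1} lands in another.)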
+std::vector<std::unordered_set<std::string>> AnalysisBatchShapesByBatchSize(
+    const std::vector<std::map<std::string, std::vector<int>>>& batches) {
+  // Collect the batch size sequence of each variable into a stringstream,
+  // which is convenient for generating a hash.
+  std::unordered_map<std::string, std::stringstream> var_batchsize_hashes;
+  for (auto& batch : batches) {
+    for (auto& ele : batch) {
+      int batch_size = ele.second.front();
+      // TODO(Superjomn) this might consume a lot of memory; use a combined
+      // hash instead.
+      var_batchsize_hashes[ele.first] << batch_size;
+    }
+  }
+
+  // Split into sets by the batch size sequences.
+  std::unordered_map<size_t, std::unordered_set<std::string>>
+      shape_sets;
+  for (auto& ele : var_batchsize_hashes) {
+    auto hash = std::hash<std::string>()(ele.second.str());
+    shape_sets[hash].insert(ele.first);
+  }
+  std::vector<std::unordered_set<std::string>> res;
+  for (auto& ele : shape_sets) {
+    res.emplace_back(std::move(ele.second));
+  }
+
+  VLOG(3) << "Clustered by batch_size and got " << res.size() << " clusters";
+  return res;
+}
+
+// Analyze the batch shapes loaded from the cache file, and split the tensors
+// into different clusters by their memory size.
+// This should work well for models in general.
+std::vector<std::unordered_set<std::string>> AnalysisBatchShapesBySimilarSize(
+    const space_table_t& space_table,
+    const std::vector<std::map<std::string, std::vector<int>>>& batches,
+    int interval = 200000) {
+  PADDLE_ENFORCE_GT(interval, 0);
+  // Cluster the tensors by size intervals.
+  size_t max_size = 0;
+  for (auto& item : space_table) {
+    max_size = std::max(item.second, max_size);
+  }
+  VLOG(4) << "tensor max size " << max_size;
+
+  std::vector<std::unordered_set<std::string>> res;
+
+  // Cluster by intervals.
+  for (size_t interval_size = 0; interval_size <= max_size;
+       interval_size += interval) {
+    std::unordered_set<std::string> cluster;
+    for (auto& item : space_table) {
+      if (interval_size <= item.second &&
+          interval_size + interval > item.second) {
+        cluster.insert(item.first);
+      }
+    }
+    if (!cluster.empty()) {
+      res.push_back(cluster);
+    }
+  }
+
+  VLOG(3) << "Clustered by interval and got " << res.size() << " clusters";
+  return res;
+}
+
+std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; }
+
+void MemoryOptimizePass::RunImpl(Argument* argument) {
+  // When a forced cache update is requested, don't optimize memory; the
+  // warm-up runs will rebuild the cache instead.
+  if (!argument->enable_memory_optim() ||
+      argument->memory_optim_force_update())
+    return;
+  graph_ = argument->main_graph_ptr();
+
+  auto path = GetMemoryCachePath(
+      argument->model_dir_valid() ? argument->model_dir() : "",
+      argument->model_program_path_valid() ?
+          argument->model_program_path() : "");
+  VLOG(3) << "Load memory cache from " << path;
+  if (inference::IsFileExists(path)) {
+    VLOG(4) << "Performing memory optimization";
+    auto batches = DeseralizeBatchVarShapes(path);
+    auto var_batch_ave_size = GetBatchAverageSize(batches);
+
+    std::unordered_map<std::string, Node*> tensor_nodes;
+    space_table_t space_table;
+    CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table);
+
+    std::unordered_map<std::string, std::string> reuse_table;
+    double max_saving_ratio = 0.;
+
+    std::vector<std::function<MemoryAllocation()>> strategies;
+
+    for (int sort_kind = 0; sort_kind < 2; sort_kind++) {
+      strategies.emplace_back([&, sort_kind] {
+        auto clustered_vars_by_batch_size =
+            AnalysisBatchShapesByBatchSize(batches);
+        MemoryAllocation allocation;
+        MakeReusePlan(clustered_vars_by_batch_size, var_batch_ave_size,
+                      space_table, &reuse_table, sort_kind, &allocation);
+        return allocation;
+      });
+
+      strategies.emplace_back([&, sort_kind] {
+        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+            space_table, batches, 1024);  // interval 1KB
+        MemoryAllocation allocation;
+        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
+                      space_table, &reuse_table, sort_kind, &allocation);
+        return allocation;
+      });
+
+      strategies.emplace_back([&, sort_kind] {
+        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+            space_table, batches, 1024 * 1024);  // interval 1MB
+        MemoryAllocation allocation;
+        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
+                      space_table, &reuse_table, sort_kind, &allocation);
+        return allocation;
+      });
+
+      strategies.emplace_back([&, sort_kind] {
+        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+            space_table, batches,
+            std::numeric_limits<int>::max());  // no intervals
+        MemoryAllocation allocation;
+        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
+                      space_table, &reuse_table, sort_kind, &allocation);
+        return allocation;
+      });
+    }
+
+    std::function<MemoryAllocation()>* best_strategy{nullptr};
+
+    // Try all strategies to get the best result.
+    for (auto& strategy : strategies) {
+      auto allocation = strategy();
+      string::PrettyLogDetail("--- got a strategy saving %f memory for workspace",
+                              allocation.GetSavingRatio());
+      if (allocation.GetSavingRatio() > max_saving_ratio) {
+        max_saving_ratio = allocation.GetSavingRatio();
+        best_strategy = &strategy;
+      }
+    }
+    if (!best_strategy) {
+      LOG(ERROR)
+          << "Memory optimization gains little on this model; skipping it";
+      return;
+    }
+    auto memory_allocation = (*best_strategy)();
+
+    string::PrettyLogH2(
+        "--- Saved %.2f%s memory for workspace(temporary variables)",
+        memory_allocation.GetSavingRatio() * 100, "%");
+    string::PrettyLogDetail("--- Allocated %d MB",
+                            memory_allocation.allocated / 1024. / 1024.);
+    string::PrettyLogDetail("--- Saved %d MB",
+                            memory_allocation.saved / 1024. / 1024.);
+    argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove,
+                               new std::unordered_set<std::string>);
+    auto& vars2remove =
+        argument->main_graph().Get<std::unordered_set<std::string>>(
+            framework::ir::kGraphToProgramVarsToRemove);
+
+    PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove);
+    argument->SetMemoryOptimSortKind(memory_allocation.sort_kind);
+  }
+}
+
+float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const {
+  return (saved / 1024.) / (allocated / 1024. + saved / 1024.);
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
new file mode 100644
index 0000000000..fa1ad9c8c6
--- /dev/null
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
@@ -0,0 +1,106 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Memory optimization pass for inference, based on a pre-analysis of memory
+ * usage and without GC.
+ * Different from training, the inference memory reuse strategy doesn't
+ * include GC, because the GC overhead is too high when the batch size is one.
+ *
+ * The inference memory reuse tries to pre-determine the tensor reusing
+ * strategy without any runtime overhead.
+ *
+ * To improve the strategy's performance, a warm-up run is introduced:
+ * - Before officially deploying the inference program, one should warm it up
+ *   and generate some runtime cache:
+ * - Run the inference program with several batches of data; it will persist
+ *   some runtime information about the memory of the tensors to disk. We call
+ *   this information the memory reusing cache.
+ * - With the memory reusing cache, the user can deploy the inference program
+ *   as a service. Before running the model, the inference program will load
+ *   the memory cache, analyze it, generate the best memory reusing strategy,
+ *   and adjust the execution of the network accordingly.
+ *
+ * With the warm-up and the memory reusing cache design, the memory reusing
+ * algorithm can analyze the real memory consumption of the tensors, even with
+ * the flexible LoDTensor and special shape-changing operators such as
+ * sequence-pooling.
+ */
+class MemoryOptimizePass : public AnalysisPass {
+ public:
+  using space_table_t = std::unordered_map<std::string, size_t>;
+  using lifecycle_t = std::pair<int, int>;
+
+  struct MemoryAllocation {
+    size_t allocated;  // allocated memory in bytes.
+    size_t saved;      // saved memory in bytes.
+    int sort_kind;     // the kind of the corresponding sorting algorithm.
+
+    // Get the memory saving ratio of temporary variables.
+    float GetSavingRatio() const;
+  };
+
+  virtual ~MemoryOptimizePass() = default;
+
+ protected:
+  void RunImpl(Argument *argument) override;
+
+ private:
+  void CollectLifeCycle(
+      std::unordered_map<std::string, lifecycle_t> *lifecycles,
+      int sort_kind) const;
+
+  void CollectVarMemorySize(
+      const std::unordered_map<std::string, size_t> &batch_var_ave_dim,
+      std::unordered_map<std::string, framework::ir::Node *> *tensor_nodes,
+      space_table_t *space_table) const;
+
+  // Build the reuse plan; the amount of memory saved is reported through
+  // memory_allocation.
+  void MakeReusePlan(
+      const std::vector<std::unordered_set<std::string>> &var_clusters,
+      const std::unordered_map<std::string, size_t> &var_batch_ave_size,
+      const space_table_t &space_table,
+      std::unordered_map<std::string, std::string> *reuse_table, int sort_kind,
+      MemoryAllocation *memory_allocation) const;
+
+  void PerformReusePlan(
+      const std::unordered_map<std::string, std::string> &reuse_table,
+      int sort_kind, std::unordered_set<std::string> *vars2remove) const;
+
+ public:
+  std::string repr() const override;
+
+ private:
+  mutable framework::ir::Graph *graph_{nullptr};
+  mutable int max_lifecycle_{-1};
+};
+
+static std::string GetMemoryCachePath(const std::string &model_path,
+                                      const std::string &prog_path) {
+  auto path = model_path.empty() ? prog_path : model_path;
+  return path + ".memory_cache";
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc
index 9245e32cee..161b127d6d 100644
--- a/paddle/fluid/inference/analysis/passes/passes.cc
+++ b/paddle/fluid/inference/analysis/passes/passes.cc
@@ -13,24 +13,31 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/passes/passes.h"
-#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
+#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
+
 PassRegistry::PassRegistry() {
+  // Register the passes manually here to avoid a trivial `USE_OP`-like macro,
+  // which keeps them easier to use and link.
passes_.emplace("ir_analysis_pass", std::unique_ptr(new IrAnalysisPass)); passes_.emplace("ir_graph_build_pass", std::unique_ptr(new IrGraphBuildPass)); - passes_.emplace("ir_analysis_compose_pass", - std::unique_ptr(new IrAnalysisComposePass)); + passes_.emplace("memory_optimize_pass", + std::unique_ptr(new MemoryOptimizePass)); passes_.emplace( "ir_params_sync_among_devices_pass", std::unique_ptr(new IrParamsSyncAmongDevicesPass)); + passes_.emplace( + "ir_graph_to_program_pass", + std::unique_ptr(new IrGraphToProgramPass)); } } // namespace analysis diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 8b3838f69a..ad0af4005a 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -18,8 +18,10 @@ if(APPLE) endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass - ir_pass_manager naive_executor analysis_predictor ${GLOB_PASS_LIB}) +set(inference_deps ${analysis_deps} + paddle_inference_api paddle_fluid_api + analysis pass naive_executor + ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) @@ -29,7 +31,8 @@ add_subdirectory(details) cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder) cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor + reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps}) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder zero_copy_tensor @@ -44,7 +47,7 @@ if(WITH_TESTING) ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) endif() -cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} +cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} ARGS --dirname=${WORD2VEC_MODEL_DIR}) if (WITH_ANAKIN AND WITH_MKL) # only needed in CI diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 965bbd0fd2..f9da3004ed 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -44,16 +44,22 @@ PassStrategy *contrib::AnalysisConfig::pass_builder() const { contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) { model_dir_ = model_dir; + + Update(); } contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file, const std::string ¶ms_file) { prog_file_ = prog_file; params_file_ = params_file; + + Update(); } void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path, const std::string ¶ms_file_path) { prog_file_ = prog_file_path; params_file_ = params_file_path; + + Update(); } void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id) { @@ -62,11 +68,17 @@ void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, memory_pool_init_size_mb_ = memory_pool_init_size_mb; 
device_id_ = device_id; #else - LOG(ERROR) << "Please compile with gpu to EnableGpu"; + LOG(ERROR) << "Please compile with gpu to EnableGpu()"; use_gpu_ = false; #endif + + Update(); +} +void contrib::AnalysisConfig::DisableGpu() { + use_gpu_ = false; + + Update(); } -void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; } contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { #define CP_MEMBER(member__) member__ = other.member__; @@ -81,6 +93,9 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { CP_MEMBER(use_gpu_); CP_MEMBER(device_id_); CP_MEMBER(memory_pool_init_size_mb_); + + CP_MEMBER(enable_memory_optim_); + CP_MEMBER(memory_optim_force_update_); // TensorRT releated. CP_MEMBER(use_tensorrt_); CP_MEMBER(tensorrt_workspace_size_); @@ -109,6 +124,8 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { } #undef CP_MEMBER + + Update(); } void contrib::AnalysisConfig::EnableMKLDNN() { @@ -119,33 +136,64 @@ void contrib::AnalysisConfig::EnableMKLDNN() { LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; use_mkldnn_ = false; #endif + + Update(); } void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, int max_batch_size, int min_subgraph_size) { +#ifdef PADDLE_WITH_CUDA + if (!use_gpu()) { + LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; + return; + } + use_tensorrt_ = true; tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; tensorrt_min_subgraph_size_ = min_subgraph_size; + Update(); +#else + LOG(ERROR) + << "To use TensorRT engine, please compile inference lib with GPU first."; +#endif } +// TODO(Superjomn) refactor this, buggy. void contrib::AnalysisConfig::Update() { auto info = SerializeInfoCache(); if (info == serialized_info_cache_) return; - if (use_gpu_) { - pass_builder_.reset(new GpuPassStrategy); + // Transfer pass_builder and copy the existing compatible passes. + if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu()))) { + if (use_gpu()) { + pass_builder_.reset(new GpuPassStrategy); + + if (use_tensorrt_) { + // Append after the Affine_channel_conv_fuse pass. + pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); + } + } else { + pass_builder_.reset(new CpuPassStrategy); + } + } else { - pass_builder_.reset(new CpuPassStrategy); + if (use_gpu()) { + pass_builder_.reset(new GpuPassStrategy( + *static_cast(pass_builder_.get()))); + + } else { + pass_builder_.reset(new CpuPassStrategy( + *static_cast(pass_builder_.get()))); + } } if (use_tensorrt_) { - if (!use_gpu_) { - LOG(ERROR) - << "TensorRT engine is not available when EnableGpu() not actived."; - } else { + const auto &passes = pass_builder_->AllPasses(); + if (std::find(passes.begin(), passes.end(), "tensorrt_subgraph_pass") == + std::end(passes)) { // Append after the Affine_channel_conv_fuse pass. 
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } @@ -165,6 +213,10 @@ void contrib::AnalysisConfig::Update() { #endif } + if (enable_memory_optim_) { + pass_builder()->AppendAnalysisPass("memory_optimize_pass"); + } + if (ir_debug_) { pass_builder()->TurnOnDebug(); } @@ -172,24 +224,43 @@ void contrib::AnalysisConfig::Update() { std::string contrib::AnalysisConfig::SerializeInfoCache() { std::stringstream ss; + ss << model_dir_; + ss << prog_file_; + ss << params_file_; + ss << use_gpu_; + ss << device_id_; ss << memory_pool_init_size_mb_; ss << use_tensorrt_; ss << tensorrt_workspace_size_; ss << tensorrt_max_batchsize_; + ss << tensorrt_min_subgraph_size_; + + ss << enable_memory_optim_; + ss << memory_optim_force_update_; ss << use_mkldnn_; + for (auto &item : mkldnn_enabled_op_types_) ss << item; + ss << ";"; + + ss << model_from_memory_; + ss << enable_ir_optim_; ss << use_feed_fetch_ops_; ss << ir_debug_; + ss << specify_input_name_; + ss << cpu_math_library_num_threads_; + return ss.str(); } void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads( int cpu_math_library_num_threads) { cpu_math_library_num_threads_ = cpu_math_library_num_threads; + + Update(); } float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { @@ -207,6 +278,17 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { #endif } +void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) { + enable_memory_optim_ = true; + memory_optim_force_update_ = force_update_cache; + + Update(); +} + +bool contrib::AnalysisConfig::enable_memory_optim() const { + return enable_memory_optim_; +} + void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, size_t prog_buffer_size, const char *param_buffer, @@ -214,6 +296,8 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size); params_file_ = std::string(param_buffer, param_buffer + param_buffer_size); model_from_memory_ = true; + + Update(); } } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3917b9b65b..2b0cad5faa 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -24,18 +24,21 @@ #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" -#if PADDLE_WITH_TENSORRT -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#endif #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" +#if PADDLE_WITH_TENSORRT +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#endif + DECLARE_bool(profile); namespace paddle { @@ -189,6 +192,12 @@ bool AnalysisPredictor::Run(const std::vector &inputs, LOG(ERROR) << "fail to get fetches"; return false; } + + // Collect variable shapes for memory optimization. 
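+  // (Shapes are only recorded during warm-up runs; see
+  // need_collect_var_shapes_for_memory_optim() further below, which checks
+  // whether the memory cache file already exists.)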
+  if (need_collect_var_shapes_for_memory_optim()) {
+    CollectVarShapes();
+  }
+
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
 
   // All the containers in the scope will be hold in inference, but the
@@ -317,6 +326,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   argument_.SetUseGPU(config_.use_gpu());
   argument_.SetGPUDeviceId(config_.gpu_device_id());
+  argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
+  argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_);
   argument_.SetModelFromMemory(config_.model_from_memory_);
   // Analyze inference_program
   if (!config_.model_dir().empty()) {
@@ -331,6 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   }
 
   if (config_.use_gpu() && config_.tensorrt_engine_enabled()) {
+    LOG(INFO) << "TensorRT subgraph engine is enabled";
     argument_.SetUseTensorRT(true);
     argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
     argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
@@ -338,12 +350,17 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   }
 
   if (config_.use_mkldnn_) {
+    LOG(INFO) << "MKLDNN is enabled";
     argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
   }
 
   auto passes = config_.pass_builder()->AllPasses();
-  if (!config_.ir_optim()) passes.clear();
+  if (!config_.ir_optim()) {
+    passes.clear();
+    LOG(INFO) << "ir_optim is turned off, no IR pass will be executed";
+  }
   argument_.SetIrAnalysisPasses(passes);
+  argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
   argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
   Analyzer().Run(&argument_);
@@ -558,6 +575,13 @@ AnalysisPredictor::~AnalysisPredictor() {
   if (sub_scope_) {
     scope_->DeleteScope(sub_scope_);
   }
+
+  // TODO(Superjomn) deduce the directory path.
+  std::string out_path = inference::analysis::GetMemoryCachePath(
+      config_.model_dir(), config_.prog_file());
+  if (need_collect_var_shapes_for_memory_optim()) {
+    SerializeBatchVarShapes(out_path);
+  }
 }
 
 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
@@ -567,6 +591,66 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
   return std::unique_ptr<PaddlePredictor>(x);
 }
 
+void AnalysisPredictor::CollectVarShapes() {
+  VLOG(4) << "Collecting var shapes";
+  if (batch_var_shapes_.size() >= max_shape_collect_count_) return;
+  std::map<std::string, std::vector<int>> var_shapes;
+  for (auto var_name : inference_program_->Block(0).LocalVarNames()) {
+    auto *var = sub_scope_->FindVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(var);
+    if (var->Type() == framework::VarTypeTrait<framework::LoDTensor>::kId ||
+        var->Type() == framework::VarTypeTrait<framework::Tensor>::kId) {
+      auto &tensor = var->Get<framework::LoDTensor>();
+      auto shape = framework::vectorize(tensor.dims());
+      var_shapes[var_name].assign(shape.begin(), shape.end());
+    }
+  }
+  batch_var_shapes_.push_back(var_shapes);
+  LOG_FIRST_N(INFO, 1) << "Collected " << batch_var_shapes_.size()
+                       << " batch(es) of var shapes for analysis";
+}
+
+void AnalysisPredictor::SerializeBatchVarShapes(const std::string &path) {
+  LOG(INFO) << "serialize batch var shapes to " << path;
+  std::ofstream file(path);
+  if (!file.is_open()) {
+    LOG(ERROR) << "failed to serialize the var shapes to " << path;
+    return;
+  }
+
+  // The serialized data format:
+  // <tensor_name>:dim0,dim1,dim2,;
+  for (auto &batch : batch_var_shapes_) {
+    for (auto &ele : batch) {
+      file << ele.first << ":";
+      for (size_t i = 0; i < ele.second.size() - 1; i++) {
+        file << ele.second[i] << ",";
+      }
+      file << ele.second.back() << ";";
+    }
+    file << "\n";
+  }
+}
+
+bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
+  if (need_collect_var_shapes_ >= 0) return
need_collect_var_shapes_; + bool need = false; + // check if the cache exists + if (!config_.enable_memory_optim()) { + need = false; + } else if (config_.enable_memory_optim() && + !inference::IsFileExists(inference::analysis::GetMemoryCachePath( + config_.model_dir(), config_.prog_file()))) { + need = true; + } else if (config_.enable_memory_optim() && + config_.memory_optim_force_update_) { + need = true; + } + + need_collect_var_shapes_ = need ? 1 : 0; + return need; +} + template <> std::unique_ptr CreatePaddlePredictor( const contrib::AnalysisConfig &config) { diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 6ca4b5e9be..e25b5a7047 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -75,6 +75,11 @@ class AnalysisPredictor : public PaddlePredictor { void SetMkldnnThreadID(int tid); protected: + // For memory optimization. + bool need_collect_var_shapes_for_memory_optim(); + void CollectVarShapes(); + void SerializeBatchVarShapes(const std::string &path); + bool PrepareProgram(const std::shared_ptr &program); bool PrepareScope(const std::shared_ptr &parent_scope); bool CreateExecutor(); @@ -118,6 +123,11 @@ class AnalysisPredictor : public PaddlePredictor { // A mutex help to make Clone thread safe. std::mutex clone_mutex_; + // For memory optimization. + const size_t max_shape_collect_count_{1000}; + int need_collect_var_shapes_{-1}; // -1 for default, 0 for false, 1 for true. + std::vector>> batch_var_shapes_; + private: // Some status here that help to determine the status inside the predictor. bool status_program_optimized_{false}; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 3df26cde3d..4688e93d71 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -16,8 +16,10 @@ #include #include #include // NOLINT +#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" DEFINE_string(dirname, "", "dirname to tests."); @@ -191,4 +193,53 @@ TEST(AnalysisPredictor, Clone) { } } +TEST(AnalysisPredictor, memory_optim) { + AnalysisConfig config(FLAGS_dirname); + config.DisableGpu(); + config.EnableMemoryOptim(true); + config.pass_builder()->TurnOnDebug(); + + auto native_predictor = + CreatePaddlePredictor(config.ToNativeConfig()); + + // 2. Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data.Reset(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + std::vector inputs(4, tensor); + std::vector output, output1; + + { + // The first predictor help to cache the memory optimize strategy. + auto predictor = CreatePaddlePredictor(config); + + // Run several times to check the parameters are not reused by mistake. + for (int i = 0; i < 5; i++) { + ASSERT_TRUE(predictor->Run(inputs, &output)); + } + } + + { + output.clear(); + // The second predictor to perform memory optimization. 
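+    // (EnableMemoryOptim(false) means force_update_cache = false, so the
+    // cache written by the first predictor is consumed here instead of being
+    // regenerated.)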
+ config.EnableMemoryOptim(false); + auto predictor = CreatePaddlePredictor(config); + + // Run with memory optimization + ASSERT_TRUE(predictor->Run(inputs, &output)); + } + + // Run native + ASSERT_TRUE(native_predictor->Run(inputs, &output1)); + + LOG(INFO) << "the output " << inference::DescribeTensor(output.front()); + LOG(INFO) << "the native output " + << inference::DescribeTensor(output1.front()); + + inference::CompareResult(output, output1); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 9811fe2cd0..963986f245 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -1,3 +1,4 @@ +#!/bin/bash set -x PADDLE_ROOT=$1 TURN_ON_MKL=$2 # use MKL or Openblas diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index cdd01cb9f0..b92781e4f2 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -15,7 +15,10 @@ #pragma once #include - +#include +#if !defined(_WIN32) +#include +#endif #include #include // NOLINT #include @@ -182,7 +185,8 @@ static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) { return true; } -static std::string DescribeTensor(const PaddleTensor &tensor) { +static std::string DescribeTensor(const PaddleTensor &tensor, + int max_num_of_data = 15) { std::stringstream os; os << "Tensor [" << tensor.name << "]\n"; os << " - type: "; @@ -253,5 +257,12 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, } } +static bool IsFileExists(const std::string &path) { + std::ifstream file(path); + bool exists = file.is_open(); + file.close(); + return exists; +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index ae6ac69854..1cee890450 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -192,6 +192,13 @@ struct AnalysisConfig { */ bool model_from_memory() const { return model_from_memory_; } + /** Turn on memory optimize + * NOTE still in development, will release latter. + */ + void EnableMemoryOptim(bool force_update_cache = false); + /** Tell whether the memory optimization is activated. */ + bool enable_memory_optim() const; + friend class ::paddle::AnalysisPredictor; /** NOTE just for developer, not an official API, easily to be broken. @@ -232,6 +239,10 @@ struct AnalysisConfig { // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; + // memory reuse related. + bool enable_memory_optim_{false}; + bool memory_optim_force_update_{false}; + bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index bc3ce72f08..039389a4cf 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -13,6 +13,7 @@ // limitations under the License. 
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
+#include <glog/logging.h>
 
 namespace paddle {
@@ -65,4 +66,8 @@ void GpuPassStrategy::EnableMKLDNN() {
   LOG(ERROR) << "GPU not support MKLDNN yet";
 }
 
+void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
+  analysis_passes_.push_back(pass);
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index efe1ba106a..d3a60d2099 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -45,6 +45,9 @@ class PaddlePassBuilder {
   /** Delete all the passes that have the type `pass_type`. */
   void DeletePass(const std::string &pass_type);
 
+  /** Append an analysis pass. */
+  void AppendAnalysisPass(const std::string &pass);
+
   /** Visualize the computation graph after each pass by generating a DOT
    * language file, one can draw them with the Graphviz toolkit.
    */
@@ -54,8 +57,18 @@ class PaddlePassBuilder {
   std::string DebugString();
 
   const std::vector<std::string> &AllPasses() const { return passes_; }
+  std::vector<std::string> AnalysisPasses() const {
+    auto passes = analysis_passes_;
+    // Make sure ir_graph_to_program is the last pass, so that any
+    // modification of the IR will be persisted to the program.
+    passes.push_back("ir_graph_to_program_pass");
+    return passes;
+  }
 
 protected:
+  std::vector<std::string> analysis_passes_{
+      {"ir_graph_build_pass", "ir_analysis_pass",
+       "ir_params_sync_among_devices_pass"}};
   std::vector<std::string> passes_;
 };
 
@@ -69,7 +82,7 @@ class PassStrategy : public PaddlePassBuilder {
   /** The MKLDNN control exists in both CPU and GPU mode, because there can be
    * still some CPU kernels running in CPU mode.
    */
-  virtual void EnableMKLDNN() = 0;
+  virtual void EnableMKLDNN() {}
 
   bool use_gpu() const { return use_gpu_; }
 
@@ -77,6 +90,7 @@ class PassStrategy : public PaddlePassBuilder {
 
  protected:
   bool use_gpu_{false};
+  bool use_mkldnn_{false};
 };
 
 /** The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
@@ -107,25 +121,31 @@ class CpuPassStrategy : public PassStrategy {
     use_gpu_ = false;
   }
 
+  explicit CpuPassStrategy(const CpuPassStrategy &other)
+      : PassStrategy(other.AllPasses()) {}
+
   virtual ~CpuPassStrategy() = default;
 
   void EnableMKLDNN() override {
 // TODO(Superjomn) Consider the way to mix CPU with GPU.
 #ifdef PADDLE_WITH_MKLDNN
-    passes_.insert(passes_.begin(), "mkldnn_placement_pass");
-
-    for (auto &pass :
-         std::vector<std::string>({"depthwise_conv_mkldnn_pass",  //
-                                   "conv_bias_mkldnn_fuse_pass",  //
-                                   "conv3d_bias_mkldnn_fuse_pass",  //
-                                   "conv_relu_mkldnn_fuse_pass",  //
-                                   "conv_elementwise_add_mkldnn_fuse_pass"})) {
-      passes_.push_back(pass);
+    if (!use_mkldnn_) {
+      passes_.insert(passes_.begin(), "mkldnn_placement_pass");
+
+      for (auto &pass : std::vector<std::string>(
+               {"depthwise_conv_mkldnn_pass",    //
+                "conv_bias_mkldnn_fuse_pass",    //
+                "conv3d_bias_mkldnn_fuse_pass",  //
+                "conv_relu_mkldnn_fuse_pass",    //
+                "conv_elementwise_add_mkldnn_fuse_pass"})) {
+        passes_.push_back(pass);
+      }
     }
+    use_mkldnn_ = true;
+#else
+    use_mkldnn_ = false;
 #endif
   }
-
-  CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {}
 };
 
/** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
@@ -150,7 +170,7 @@ class GpuPassStrategy : public PassStrategy { use_gpu_ = true; } - GpuPassStrategy(const GpuPassStrategy &other) + explicit GpuPassStrategy(const GpuPassStrategy &other) : PassStrategy(other.AllPasses()) { use_gpu_ = true; } diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index adbf98e9e8..423c39813f 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -19,7 +19,7 @@ endfunction() function(inference_analysis_api_test target install_dir filename) inference_analysis_test(${target} SRCS ${filename} - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt) endfunction() @@ -62,7 +62,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 # normal DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL) +inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL) # small DAM set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index fc87e0a8d1..4ec9404ab4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -126,6 +126,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, std::string turn_mask_pre = "turn_mask_"; auto one_batch = data->NextBatch(); + PADDLE_ENFORCE(!one_batch.response.empty()); int size = one_batch.response[0].size(); CHECK_EQ(size, kMaxTurnLen); // turn tensor assignment @@ -200,6 +201,7 @@ void profile(bool use_mkldnn = false) { std::vector outputs; std::vector> input_slots_all; SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), input_slots_all, &outputs, FLAGS_num_threads); @@ -250,7 +252,35 @@ void compare(bool use_mkldnn = false) { reinterpret_cast(&cfg), input_slots_all); } +// Compare result of NativeConfig and AnalysisConfig with memory optimization. +TEST(Analyzer_dam, compare_with_memory_optim) { + // The small dam will core in CI, but works in local. + if (FLAGS_max_turn_num == 9) { + contrib::AnalysisConfig cfg, cfg1; + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + // Run the first time to force to update memory cache + SetConfig(&cfg); + cfg.EnableMemoryOptim(true); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), + input_slots_all); + + // Run second time to use the memory cache and perform memory optimization. 
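+    // (cfg1 calls EnableMemoryOptim() with its default force_update_cache =
+    // false, so this run reads the cache produced by the run above.)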
+ SetConfig(&cfg1); + cfg1.EnableMemoryOptim(); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg1), + input_slots_all); + } +} + TEST(Analyzer_dam, compare) { compare(); } + #ifdef PADDLE_WITH_MKLDNN TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); } #endif diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 7b448a3200..2db297e200 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -69,6 +69,7 @@ void SetInput(std::vector> *inputs) { TEST(Analyzer_Text_Classification, profile) { AnalysisConfig cfg; SetConfig(&cfg); + cfg.pass_builder()->TurnOnDebug(); std::vector outputs; std::vector> input_slots_all; @@ -98,6 +99,7 @@ TEST(Analyzer_Text_Classification, profile) { TEST(Analyzer_Text_Classification, compare) { AnalysisConfig cfg; SetConfig(&cfg); + cfg.EnableMemoryOptim(); std::vector> input_slots_all; SetInput(&input_slots_all); diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 5a77b53a85..f3e75ffbb5 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include #include "paddle/fluid/inference/tests/api/tester_helper.h" @@ -55,7 +56,7 @@ void SetConfig(AnalysisConfig *cfg) { FLAGS_infer_model + "/__params__"); cfg->DisableGpu(); cfg->SwitchIrDebug(); - cfg->SwitchSpecifyInputNames(); + cfg->SwitchSpecifyInputNames(false); // TODO(TJ): fix fusion gru cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); } @@ -86,6 +87,7 @@ void profile(bool use_mkldnn = false) { if (use_mkldnn) { cfg.EnableMKLDNN(); } + // cfg.pass_builder()->TurnOnDebug(); std::vector outputs; std::vector> input_slots_all; @@ -103,9 +105,8 @@ void profile(bool use_mkldnn = false) { size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); CHECK_EQ(numel, refer.data.size()); for (size_t i = 0; i < numel; ++i) { - CHECK_LT( - fabs(static_cast(output.data.data())[i] - refer.data[i]), - 1e-5); + EXPECT_NEAR(static_cast(output.data.data())[i], refer.data[i], + 1e-5); } } } diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index ac964dc0c8..d2ca1d0b00 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -15,6 +15,7 @@ #pragma once #include + #include #include #include // NOLINT @@ -28,9 +29,8 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" - #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/tests/api/config_printer.h" #include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/utils/benchmark.h" @@ -91,7 +91,7 @@ void CompareResult(const std::vector &outputs, float *pdata = static_cast(out.data.data()); float *pdata_ref = static_cast(ref_out.data.data()); for (size_t j = 0; j < 
size; ++j) { - EXPECT_NEAR(pdata_ref[j], pdata[j], FLAGS_accuracy); + CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy); } break; } diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 9725c19032..5aca807ee3 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -157,5 +157,10 @@ TEST(AnalysisPredictor, use_gpu) { } } +TEST(TensorRT_mobilenet, profile) { + std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; + profile(model_dir, true, false); +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h index 76a3dd2c29..a1304cf4e7 100644 --- a/paddle/fluid/inference/utils/benchmark.h +++ b/paddle/fluid/inference/utils/benchmark.h @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#pragma once +#pragma once #include #include #include diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc index eb25547408..80763160df 100644 --- a/paddle/fluid/inference/utils/benchmark_tester.cc +++ b/paddle/fluid/inference/utils/benchmark_tester.cc @@ -16,7 +16,7 @@ #include #include -using namespace paddle::inference; +using namespace paddle::inference; // NOLINT TEST(Benchmark, basic) { Benchmark benchmark; benchmark.SetName("key0"); @@ -36,4 +36,4 @@ TEST(Benchmark, PersistToFile) { benchmark.PersistToFile("1.log"); benchmark.PersistToFile("1.log"); benchmark.PersistToFile("1.log"); -} \ No newline at end of file +} diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 86b3114cb3..0dfed7f5cc 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -50,6 +50,7 @@ class FeedOp : public framework::OperatorBase { << out_name; auto &feed_list = feed_var->Get(); + PADDLE_ENFORCE_LT(static_cast(col), feed_list.size()); auto &feed_item = feed_list.at(static_cast(col)); auto *out_item = out_var->GetMutable(); diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h index 10c9eb80d0..da4c1f326f 100644 --- a/paddle/fluid/string/pretty_log.h +++ b/paddle/fluid/string/pretty_log.h @@ -66,5 +66,22 @@ static void PrettyLog(const std::string &style, const char *fmt, std::cerr << style << Sprintf(fmt, args...) << reset(); } +template +static void PrettyLogInfo(const char *fmt, const Args &... args) { + PrettyLogEndl(Style::info(), fmt, args...); +} +template +static void PrettyLogDetail(const char *fmt, const Args &... args) { + PrettyLogEndl(Style::detail(), fmt, args...); +} +template +static void PrettyLogH1(const char *fmt, const Args &... args) { + PrettyLogEndl(Style::H1(), fmt, args...); +} +template +static void PrettyLogH2(const char *fmt, const Args &... 
args) {
+  PrettyLogEndl(Style::H2(), fmt, args...);
+}
+
 }  // namespace string
 }  // namespace paddle

From cf0a057981e3d2b9b5b8265f1c5f2b2a238d832c Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Mon, 21 Jan 2019 16:24:25 +0800
Subject: [PATCH 041/123] add document for ctr reader test=develop

---
 python/paddle/fluid/contrib/reader/README.md    | 15 +++++++++++++++
 .../paddle/fluid/contrib/reader/ctr_reader.py   | 19 ++++++++++++++-----
 2 files changed, 29 insertions(+), 5 deletions(-)
 create mode 100644 python/paddle/fluid/contrib/reader/README.md

diff --git a/python/paddle/fluid/contrib/reader/README.md b/python/paddle/fluid/contrib/reader/README.md
new file mode 100644
index 0000000000..9e4b7d1ce3
--- /dev/null
+++ b/python/paddle/fluid/contrib/reader/README.md
@@ -0,0 +1,15 @@
+## CTR READER
+
+A multi-threaded C++ reader that has the same interface as py_reader. It
+uses C++ threads to read files and is much faster than the Python reading
+thread in py_reader.
+
+Currently, it supports two types of file:
+ - gzip
+ - plain text file
+
+and two types of data format:
+ - the csv data format is:
+   * label dense_fea,dense_fea sparse_fea,sparse_fea
+ - the svm data format is:
+   * label slot1:fea_sign slot2:fea_sign slot1:fea_sign
diff --git a/python/paddle/fluid/contrib/reader/ctr_reader.py b/python/paddle/fluid/contrib/reader/ctr_reader.py
index aad8ded87d..cc10ab239b 100644
--- a/python/paddle/fluid/contrib/reader/ctr_reader.py
+++ b/python/paddle/fluid/contrib/reader/ctr_reader.py
@@ -54,8 +54,8 @@ def ctr_reader(
         feed_dict,
         file_type,  # gzip or plain
         file_format,  # csv or svm
-        dense_slot_indexs,
-        sparse_slot_indexs,
+        dense_slot_index,
+        sparse_slot_index,
         capacity,
         thread_num,
         batch_size,
@@ -78,11 +78,20 @@ def ctr_reader(
     Note that :code:`Program.clone()` method cannot clone :code:`py_reader`.

     Args:
+        feed_dict(list(variable)): a list of data variables.
+        file_type('gzip'|'plain'): the type of the data file.
+        file_format('csv'|'svm'): csv data or svm data format.
+            the csv data format is:
+                label dense_fea,dense_fea sparse_fea,sparse_fea
+            the svm data format is:
+                label slot1:fea_sign slot2:fea_sign slot1:fea_sign
+        dense_slot_index(list(int)): the indices of the dense slots.
+        sparse_slot_index(list(int)): the indices of the sparse slots.
         capacity(int): The buffer capacity maintained by :code:`py_reader`.
         thread_num(int): the number of threads that read the files.
         batch_size(int): the batch size of the data.
         file_list(list(str)): the list of file paths to read.
-        slots(bool): Whether use double buffer or not.
+        slots(list(int)): the slot ids of all the sparse features.
         name(basestring): The prefix Python queue name and Reader name. None
             will be generated automatically.
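+
+        A minimal usage sketch (illustrative only -- the file names, slot ids
+        and the data variable below are made-up placeholders, not part of the
+        real API documentation):
+
+        .. code-block:: python
+
+            label = fluid.layers.data(name='click', shape=[1], dtype='int64')
+            reader = ctr_reader(
+                feed_dict=[label],
+                file_type='gzip',
+                file_format='svm',
+                dense_slot_index=[],
+                sparse_slot_index=[0],
+                capacity=64,
+                thread_num=4,
+                batch_size=1000,
+                file_list=['ctr_data/part-0.gz', 'ctr_data/part-1.gz'],
+                slots=[1, 2])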
@@ -116,8 +125,8 @@ def ctr_reader( 'file_list': file_list, 'file_type': file_type, 'file_format': file_format, - 'dense_slot_index': dense_slot_indexs, - 'sparse_slot_index': sparse_slot_indexs, + 'dense_slot_index': dense_slot_index, + 'sparse_slot_index': sparse_slot_index, 'sparse_slots': slots, 'ranks': [], 'lod_levels': [], From 561ae9d50712f2f415614326443e554d10475bcd Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 21 Jan 2019 18:03:19 +0800 Subject: [PATCH 042/123] remove legacy WITH_C_API option --- CMakeLists.txt | 17 ---------------- cmake/external/gflags.cmake | 10 --------- cmake/external/glog.cmake | 9 --------- cmake/external/mkldnn.cmake | 4 ---- cmake/external/mklml.cmake | 4 ---- cmake/external/openblas.cmake | 19 ------------------ cmake/external/protobuf.cmake | 9 --------- cmake/external/pslib.cmake | 4 ---- cmake/external/pslib_brpc.cmake | 4 ---- cmake/external/xxhash.cmake | 9 --------- cmake/external/zlib.cmake | 9 --------- paddle/scripts/README.md | 1 - paddle/scripts/paddle_build.sh | 29 +++------------------------ paddle/scripts/paddle_docker_build.sh | 1 - 14 files changed, 3 insertions(+), 126 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a51552d96a..bbf3acb8ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,7 +60,6 @@ option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) -option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) @@ -119,10 +118,6 @@ if(ANDROID OR IOS) "Disable golang when cross-compiling for Android and iOS" FORCE) # Compile PaddlePaddle mobile inference library - if (NOT WITH_C_API) - set(WITH_C_API ON CACHE STRING - "Always compile the C_API when cross-compiling for Android and iOS" FORCE) - endif() set(MOBILE_INFERENCE ON) add_definitions(-DPADDLE_MOBILE_INFERENCE) endif() @@ -135,8 +130,6 @@ endif() if (WIN32) set(WITH_DISTRIBUTE OFF CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE) - set(WITH_C_API OFF CACHE STRING - "Disable C_API when compiling for Windows" FORCE) set(WITH_FLUID_ONLY ON CACHE STRING "Enable FLUID_ONLY when compiling for Windows" FORCE) endif() @@ -150,16 +143,6 @@ set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING "A path setting fluid inference shared and static libraries") -if (WITH_C_API AND WITH_PYTHON) - message(WARNING "It is suggest not embedded a python interpreter in Paddle " - "when using C-API. 
It will give an unpredictable behavior when using a " - "different Python interpreter from compiling.") -endif() - -if (WITH_C_API) - set(WITH_FLUID_ONLY OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE) -endif() - if(MOBILE_INFERENCE) set(THIRD_PARTY_BUILD_TYPE MinSizeRel) else() diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 95ca16f57f..f3ca74faea 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -71,13 +71,3 @@ if (WIN32) set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib) endif(HAVE_SHLWAPI) endif (WIN32) - -IF(WITH_C_API) - INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags) - IF(ANDROID) - INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI}) - ELSE() - INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib) - ENDIF() -ENDIF() - diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 8cd0455c16..72a2f60191 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -78,12 +78,3 @@ ADD_DEPENDENCIES(glog extern_glog gflags) LINK_LIBRARIES(glog gflags) LIST(APPEND external_project_dependencies glog) - -IF(WITH_C_API) - INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog) - IF(ANDROID) - INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI}) - ELSE() - INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib) - ENDIF() -ENDIF() diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 03f0dee859..6a7be73f09 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -110,7 +110,3 @@ else(WIN32) endif(WIN32) ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn) -IF(WITH_C_API) - INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib) -ENDIF() - diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 43322a257a..2caff27357 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -74,7 +74,3 @@ ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) LIST(APPEND external_project_dependencies mklml) - -IF(WITH_C_API) - INSTALL(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib) -ENDIF() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index aeb976b840..019745aad0 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -92,25 +92,6 @@ IF(NOT ${CBLAS_FOUND}) ELSE() ENDIF(NOT WIN32) SET(CBLAS_PROVIDER openblas) - IF(WITH_C_API) - INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas) - # Because libopenblas.a is a symbolic link of another library, thus need to - # install the whole directory. 
- IF(ANDROID) - SET(TMP_INSTALL_DIR third_party/openblas/lib/${ANDROID_ABI}) - ELSE() - SET(TMP_INSTALL_DIR third_party/openblas/lib) - ENDIF() - INSTALL(CODE "execute_process( - COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib - ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR} - )" - ) - INSTALL(CODE "MESSAGE(STATUS \"Installing: \" - \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\" - )" - ) - ENDIF() ENDIF(NOT ${CBLAS_FOUND}) MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index e1e619e572..16fd9fac92 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -266,15 +266,6 @@ IF(NOT PROTOBUF_FOUND) SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." FORCE) - IF(WITH_C_API) - INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf) - IF(ANDROID) - INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI}) - ELSE() - INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib) - ENDIF() - ENDIF() - IF(CMAKE_CROSSCOMPILING) PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf) ELSE() diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index 3b495d78e2..b4ea268e5a 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -71,7 +71,3 @@ ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) LIST(APPEND external_project_dependencies pslib) - -IF(WITH_C_API) - INSTALL(FILES ${PSLIB_LIB} ${PSLIB_IOMP_LIB} DESTINATION lib) -ENDIF() diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake index 7ff5a8aca1..8b43f2ef5c 100644 --- a/cmake/external/pslib_brpc.cmake +++ b/cmake/external/pslib_brpc.cmake @@ -71,7 +71,3 @@ ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB}) ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT}) LIST(APPEND external_project_dependencies pslib_brpc) - -IF(WITH_C_API) - INSTALL(FILES ${PSLIB_BRPC_LIB} ${PSLIB_BRPC_IOMP_LIB} DESTINATION lib) -ENDIF() diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index c3e1212d8f..a0f300c2e8 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -73,12 +73,3 @@ include_directories(${XXHASH_INCLUDE_DIR}) add_dependencies(xxhash extern_xxhash) LIST(APPEND external_project_dependencies xxhash) - -IF(WITH_C_API) - INSTALL(DIRECTORY ${XXHASH_INCLUDE_DIR} DESTINATION third_party/xxhash) - IF(ANDROID) - INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib/${ANDROID_ABI}) - ELSE() - INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib) - ENDIF() -ENDIF() diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index d350737537..6c8d79c25e 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -59,12 +59,3 @@ SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) ADD_DEPENDENCIES(zlib extern_zlib) LIST(APPEND external_project_dependencies zlib) - -IF(WITH_C_API) - INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib) - IF(ANDROID) - INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib/${ANDROID_ABI}) - ELSE() - INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib) - ENDIF() -ENDIF() diff --git a/paddle/scripts/README.md 
b/paddle/scripts/README.md index 9e8b135c1b..2772224506 100644 --- a/paddle/scripts/README.md +++ b/paddle/scripts/README.md @@ -69,7 +69,6 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. | | `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. | | `WITH_SWIG_PY` | ON | Build with SWIG python API support. | -| `WITH_C_API` | OFF | Build capi libraries for inference. | | `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. | | `WITH_STYLE_CHECK` | ON | Check the code style when building. | | `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu | diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index f58e392684..cbd39d7a5d 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -33,7 +33,6 @@ function print_usage() { ${BLUE}gen_doc_lib${NONE}: generate paddle documents library ${BLUE}html${NONE}: convert C++ source code into HTML ${BLUE}dockerfile${NONE}: generate paddle release dockerfile - ${BLUE}capi${NONE}: generate paddle CAPI package ${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library ${BLUE}check_style${NONE}: run code style check ${BLUE}cicheck${NONE}: run CI tasks @@ -180,7 +179,6 @@ function cmake_gen() { -DWITH_AVX=${WITH_AVX:-OFF} -DWITH_GOLANG=${WITH_GOLANG:-OFF} -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} - -DWITH_C_API=${WITH_C_API:-OFF} -DWITH_PYTHON=${WITH_PYTHON:-ON} -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} -DCUDNN_ROOT=/usr/ @@ -217,7 +215,6 @@ EOF -DWITH_GOLANG=${WITH_GOLANG:-OFF} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \ - -DWITH_C_API=${WITH_C_API:-OFF} \ -DWITH_PYTHON=${WITH_PYTHON:-ON} \ -DCUDNN_ROOT=/usr/ \ -DWITH_TESTING=${WITH_TESTING:-ON} \ @@ -706,21 +703,10 @@ EOF EOF } -function gen_capi_package() { - if [[ ${WITH_C_API} == "ON" ]]; then - capi_install_prefix=${INSTALL_PREFIX:-/paddle/build}/capi_output - rm -rf $capi_install_prefix - make DESTDIR="$capi_install_prefix" install - cd $capi_install_prefix/ - ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz - fi -} - function gen_fluid_lib() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build - if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then - cat < Date: Mon, 21 Jan 2019 18:13:14 +0800 Subject: [PATCH 043/123] remove legacy WITH_SWIG_PY option --- CMakeLists.txt | 2 - cmake/external/swig.cmake | 65 --- paddle/py_paddle/.gitignore | 2 - paddle/py_paddle/__init__.py | 24 - paddle/py_paddle/dataprovider_converter.py | 309 ----------- paddle/py_paddle/util.py | 578 --------------------- paddle/scripts/README.md | 1 - paddle/scripts/paddle_build.sh | 2 - python/CMakeLists.txt | 2 - 9 files changed, 985 deletions(-) delete mode 100644 cmake/external/swig.cmake delete mode 100644 paddle/py_paddle/.gitignore delete mode 100644 paddle/py_paddle/__init__.py delete mode 100644 paddle/py_paddle/dataprovider_converter.py delete mode 100644 paddle/py_paddle/util.py diff --git a/CMakeLists.txt b/CMakeLists.txt index bbf3acb8ad..b4a700f974 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,7 +49,6 @@ option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FO option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." 
OFF) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) -option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF) option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) @@ -176,7 +175,6 @@ include(external/python) # download, build, install python include(external/openblas) # download, build, install openblas include(external/mkldnn) # download, build, install mkldnn include(external/ngraph) # download, build, install nGraph -include(external/swig) # download, build, install swig include(external/boost) # download boost include(external/any) # download libn::any include(external/eigen) # download eigen3 diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake deleted file mode 100644 index de07703695..0000000000 --- a/cmake/external/swig.cmake +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -IF(NOT WITH_SWIG_PY) - return() -ENDIF() - -FIND_PACKAGE(SWIG) - -IF(NOT SWIG_FOUND) - # build swig as an external project - INCLUDE(ExternalProject) - - SET(SWIG_SOURCES_DIR ${THIRD_PARTY_PATH}/swig) - SET(SWIG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/swig) - SET(SWIG_TARGET_VERSION "3.0.2") - SET(SWIG_DOWNLOAD_SRC_MD5 "62f9b0d010cef36a13a010dc530d0d41") - SET(SWIG_DOWNLOAD_WIN_MD5 "3f18de4fc09ab9abb0d3be37c11fbc8f") - - IF(WIN32) - # swig.exe available as pre-built binary on Windows: - ExternalProject_Add(swig - URL http://prdownloads.sourceforge.net/swig/swigwin-${SWIG_TARGET_VERSION}.zip - URL_MD5 ${SWIG_DOWNLOAD_WIN_MD5} - SOURCE_DIR ${SWIG_SOURCES_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - UPDATE_COMMAND "" - ) - SET(SWIG_DIR ${SWIG_SOURCES_DIR} CACHE FILEPATH "SWIG Directory" FORCE) - SET(SWIG_EXECUTABLE ${SWIG_SOURCES_DIR}/swig.exe CACHE FILEPATH "SWIG Executable" FORCE) - ELSE(WIN32) - # swig uses bison find it by cmake and pass it down - FIND_PACKAGE(BISON) - - # From SWIG configure - ExternalProject_Add(swig - GIT_REPOSITORY https://github.com/swig/swig.git - GIT_TAG rel-3.0.10 - PREFIX ${SWIG_SOURCES_DIR} - CONFIGURE_COMMAND cd <SOURCE_DIR> && ./autogen.sh && ./configure - --prefix=${SWIG_INSTALL_DIR} --without-pcre - BUILD_COMMAND cd <SOURCE_DIR> && make - INSTALL_COMMAND cd <SOURCE_DIR> && make install - UPDATE_COMMAND "" - ) - - SET(SWIG_DIR ${SWIG_INSTALL_DIR}/share/swig/${SWIG_TARGET_VERSION}) - SET(SWIG_EXECUTABLE ${SWIG_INSTALL_DIR}/bin/swig) - ENDIF(WIN32) - - LIST(APPEND external_project_dependencies swig) -ENDIF(NOT SWIG_FOUND) diff --git a/paddle/py_paddle/.gitignore b/paddle/py_paddle/.gitignore deleted file mode 100644 index 80d1f76fbc..0000000000 --- a/paddle/py_paddle/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -swig_paddle.py -_swig_paddle.so diff --git a/paddle/py_paddle/__init__.py b/paddle/py_paddle/__init__.py deleted file mode 100644 index
5504d1d50c..0000000000 --- a/paddle/py_paddle/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from util import DataProviderWrapperConverter -from dataprovider_converter import DataProviderConverter - -__all__ = [ - 'paddle', - 'DataProviderConverter', - 'DataProviderWrapperConverter', # for deprecated usage. - 'loadParameterFile' -] -util.monkeypatches() diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py deleted file mode 100644 index 43614b9779..0000000000 --- a/paddle/py_paddle/dataprovider_converter.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.trainer.PyDataProvider2 as dp2 -import collections -import swig_paddle -import numpy -import itertools -from functools import reduce - -__all__ = ['DataProviderConverter'] - - -class IScanner(object): - """ - The scanner will scan Python object two passes, then convert it to Paddle's - argument. - - In the first pass, `pre_scan` will be invoked by every data instance, and - then invoke `finish_pre_scan` to arguments. And the second pass do the same - thing except the functions changed to `scan`, `finish_scan`. - - During the first pass, a scanner may count the shape of input matrix and - allocate memory for this argument. Then fill the data into this argument - in second pass. - """ - - def __init__(self, input_type, pos): - self.input_type = input_type - if not isinstance(self.input_type, dp2.InputType): - raise ValueError("input type should be dataprovider2.InputType") - self.pos = pos - # data_in_gpu is used to indicate whether to create argument on GPU - # or not in GPU mode. Now if using one thread (trainer_count=1), - # trainer uses NeuralNetwork which needs to create argument on GPU - # before calling forward function. So, set data_in_gpu to True. - # Otherwise, trainer uses MultiGradientMachine which will transfer - # data from CPU to GPU in the forward function, set data_in_gpu to - # False in this case. - self.data_in_gpu = swig_paddle.isUsingGpu( - ) and swig_paddle.getTrainerCount() == 1 - - def pre_scan(self, dat): - """ - First pass scan method. During this method, the scanner could count the - data number, and get the total memory size this batch would use. - - :param dat: The python object. 
- """ - pass - - def finish_pre_scan(self, argument): - """ - Finish first scan pass. Allocate the memory. - - :param argument: Output arguments object. - :type argument: swig_paddle.Arguments - :param dat: Output arguments object. - :type dat: The Python object, numpy.array or List. - :return: - """ - pass - - def scan(self, dat): - """ - Second pass scan method. Copy the data to arguments. - - :param dat: The python object. - """ - pass - - def finish_scan(self, argument): - """ - Finish second pass. Finalize the resources, etc. - - :param argument: Output arguments object. - :type argument: swig_paddle.Arguments - """ - pass - - -class DenseScanner(IScanner): - """ - :type __mat__: numpy.ndarray - """ - - def __init__(self, input_type, pos): - IScanner.__init__(self, input_type, pos) - self.__mat__ = None - self.__shape__ = None - self.__height__ = 0 - self.__dim__ = 0 - - def pre_scan(self, dat): - self.__height__ += 1 - if self.__shape__ is None: - self.__shape__ = numpy.array(dat).shape - if len(self.__shape__) > 3: - raise ValueError( - "The dimension of input cannot be greater than 3.") - if len(self.__shape__) == 0: - raise ValueError( - "The input should be a vector, please check your input data." - ) - self.__dim__ = reduce(lambda x, y: x * y, self.__shape__) - if len(self.__shape__) == 1 and self.__dim__ != self.input_type.dim: - raise ValueError( - "The data size must be equal to it in data layer.") - else: - if self.__shape__ != numpy.array(dat).shape: - raise ValueError( - "The data shape must be same in one mini-batch.") - - def finish_pre_scan(self, argument): - self.__mat__ = numpy.ndarray( - shape=(self.__height__, self.__dim__), dtype=numpy.float32) - self.__height__ = 0 - - def scan(self, dat): - # It's better to use NumPy array for speed. - dat = numpy.array(dat) - dat = dat.flatten() - self.__mat__[self.__height__] = dat - self.__height__ += 1 - - def finish_scan(self, argument): - assert isinstance(argument, swig_paddle.Arguments) - if self.__mat__.dtype != numpy.float32: - self.__mat__ = self.__mat__.astype(numpy.float32) - m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, - self.data_in_gpu) - argument.setSlotValue(self.pos, m) - if len(self.__shape__) > 1: - # The last-two dimenstions are the frame height and width. - # For example, the layout is CHW for 3-D feature of image. - # The H and W are the frame height and width. 
- h, w = self.__shape__[-2:] - argument.setSlotFrameHeight(self.pos, h) - argument.setSlotFrameWidth(self.pos, w) - self.__shape__ = None - - -class SparseBinaryScanner(IScanner): - def __init__(self, input_type, pos): - IScanner.__init__(self, input_type, pos) - self.__rows__ = [0] - self.__cols__ = [] - self.__height__ = 0 - self.__value__ = [] - - def scan(self, dat): - self.extend_cols(dat) - self.__rows__.append(len(self.__cols__)) - self.__height__ += 1 - - def extend_cols(self, dat): - self.__cols__.extend(dat) - - def finish_scan(self, argument): - assert isinstance(argument, swig_paddle.Arguments) - m = swig_paddle.Matrix.createSparse( - self.__height__, - self.input_type.dim, - len(self.__cols__), - len(self.__value__) == 0, - False, # trans - False) # TODO supoort GPU - assert isinstance(m, swig_paddle.Matrix) - m.sparseCopyFrom(self.__rows__, self.__cols__, self.__value__) - argument.setSlotValue(self.pos, m) - - -class SparseFloatScanner(SparseBinaryScanner): - def __init__(self, input_type, pos): - SparseBinaryScanner.__init__(self, input_type, pos) - - def extend_cols(self, dat): - self.__cols__.extend((x[0] for x in dat)) - self.__value__.extend((x[1] for x in dat)) - - -class IndexScanner(IScanner): - def __init__(self, input_type, pos): - IScanner.__init__(self, input_type, pos) - self.__ids__ = None - self.__idx__ = 0 - - def pre_scan(self, dat): - self.__idx__ += 1 - - def finish_pre_scan(self, argument): - self.__ids__ = [0] * self.__idx__ - self.__idx__ = 0 - - def scan(self, dat): - self.__ids__[self.__idx__] = dat - self.__idx__ += 1 - - def finish_scan(self, argument): - ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu) - assert isinstance(argument, swig_paddle.Arguments) - argument.setSlotIds(self.pos, ids) - - -class SequenceScanner(IScanner): - def __init__(self, input_type, pos, inner_scanner, setter): - IScanner.__init__(self, input_type, pos) - self.__seq__ = [0] - self.__inner_scanner__ = inner_scanner - self.__setter__ = setter - - def pre_scan(self, dat): - for each in dat: - self.__inner_scanner__.pre_scan(each) - - def finish_pre_scan(self, argument): - self.__inner_scanner__.finish_pre_scan(argument) - - def scan(self, dat): - self.__seq__.append(self.__seq__[-1] + self.get_size(dat)) - for each in dat: - self.__inner_scanner__.scan(each) - - def finish_scan(self, argument): - seq = swig_paddle.IVector.create(self.__seq__, False) - self.__setter__(argument, self.pos, seq) - self.__inner_scanner__.finish_scan(argument) - - def get_size(self, dat): - if isinstance(self.__inner_scanner__, SequenceScanner): - return sum(self.__inner_scanner__.get_size(item) for item in dat) - else: - return len(dat) - - -class DataProviderConverter(object): - def __init__(self, input_types): - self.input_types = input_types - assert isinstance(self.input_types, collections.Sequence) - for each in self.input_types: - assert isinstance(each, dp2.InputType) - - def convert(self, dat, argument=None): - if argument is None: - argument = swig_paddle.Arguments.createArguments(0) - assert isinstance(argument, swig_paddle.Arguments) - argument.resize(len(self.input_types)) - - scanners = [ - DataProviderConverter.create_scanner(i, each_type) - for i, each_type in enumerate(self.input_types) - ] - - for each_sample in dat: - for each_step, scanner in itertools.izip(each_sample, scanners): - scanner.pre_scan(each_step) - - for scanner in scanners: - scanner.finish_pre_scan(argument) - - for each_sample in dat: - for each_step, scanner in itertools.izip(each_sample, 
scanners): - scanner.scan(each_step) - - for scanner in scanners: - scanner.finish_scan(argument) - - return argument - - def __call__(self, dat, argument=None): - return self.convert(dat, argument) - - @staticmethod - def create_scanner(i, each): - assert isinstance(each, dp2.InputType) - retv = None - if each.type == dp2.DataType.Dense: - retv = DenseScanner(each, i) - elif each.type == dp2.DataType.Index: - retv = IndexScanner(each, i) - elif each.type == dp2.DataType.SparseNonValue: - retv = SparseBinaryScanner(each, i) - elif each.type == dp2.DataType.SparseValue: - retv = SparseFloatScanner(each, i) - assert retv is not None - - if each.seq_type == dp2.SequenceType.SUB_SEQUENCE: - retv = SequenceScanner( - each, i, retv, - lambda a, p, seq: a.setSlotSubSequenceStartPositions(p, seq)) - - if each.seq_type in [ - dp2.SequenceType.SUB_SEQUENCE, dp2.SequenceType.SEQUENCE - ]: - retv = SequenceScanner( - each, i, retv, - lambda a, p, seq: a.setSlotSequenceStartPositions(p, seq)) - return retv diff --git a/paddle/py_paddle/util.py b/paddle/py_paddle/util.py deleted file mode 100644 index 3ae8dbf964..0000000000 --- a/paddle/py_paddle/util.py +++ /dev/null @@ -1,578 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Some Useful method for py_paddle. -""" - -import swig_paddle -import os -import paddle.trainer.PyDataProviderWrapper -import paddle.proto.ParameterConfig_pb2 -import paddle.proto.ModelConfig_pb2 -import paddle.proto.TrainerConfig_pb2 -import weakref -import numpy -import struct -import sys -import copy - - -def initializePaddle(*args): - """ - To initialize paddle process. - :param args: Command line options, such as --use_gpu=0, etc. - :return: Nothing. - """ - old_argv = copy.deepcopy(sys.argv) - old_pypath = os.getenv("PYTHONPATH") - pypath = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) - if old_pypath is not None: - pypath = os.pathsep.join([pypath, old_pypath]) - os.putenv("PYTHONPATH", pypath) - args = [""] + list(args) # argv[0] is command name, it is not important. - swig_paddle.__initPaddle__(args) - sys.argv = old_argv - - -def __monkeypatch_init_paddle__(): - swig_paddle.__initPaddle__ = swig_paddle.initPaddle - swig_paddle.initPaddle = initializePaddle - - -class __ParameterCallbackWrapper__(swig_paddle.UpdateCallback): - """ - Wrap the python callable object to paddle.UpdateCallback. - - INTERNAL USE ONLY. - """ - - def __init__(self, callback): - swig_paddle.UpdateCallback.__init__(self) - self.callback = callback - - def apply(self, param): - self.callback(param) - - @staticmethod - def wrap(callback): - """ - Cast the python callable object/paddle.UpdateCallback to - swig_paddle.UpdateCallback.__disown__ - :param callback: callable or swig_paddle.UpdateCallback object. 
- """ - if isinstance(callback, swig_paddle.UpdateCallback): - return callback.__disown__() - elif isinstance(callback, weakref.ProxyType): - raise RuntimeError("Should not pass __disown__ object") - else: - return __ParameterCallbackWrapper__(callback).__disown__() - - -def __arguments_to_numpy__(i, arg): - assert isinstance(arg, swig_paddle.Arguments) - value = arg.getSlotValue(i) - ids = arg.getSlotIds(i) - prob = arg.getSlotIn(i) - if value is not None: - assert isinstance(value, swig_paddle.Matrix) - value = value.copyToNumpyMat() - if ids is not None: - assert isinstance(ids, swig_paddle.IVector) - ids = ids.copyToNumpyArray() - if prob is not None: - assert isinstance(prob, swig_paddle.Matrix) - prob = prob.copyToNumpyMat() - return {"value": value, "id": ids, "prob": prob} - - -def __monkeypatch_gradient_machine__(): - """ - Add some class methods to GradientMachine. - This method should be only used internally. - """ - swig_paddle.GradientMachine.loadFromConfigFile = \ - staticmethod(loadGradientMachine) - - def __matrix_to_numpy__(m): - if isinstance(m, swig_paddle.Matrix): - return m.copyToNumpyMat() - elif isinstance(m, swig_paddle.IVector): - return m.copyToNumpyArra() - else: - raise RuntimeError("Input arg should be matrix or vecotr.") - - def createFromConfigProto(protoObj, - createMode=swig_paddle.CREATE_MODE_NORMAL, - paramTypes=[ - swig_paddle.PARAMETER_VALUE, - swig_paddle.PARAMETER_GRADIENT, - swig_paddle.PARAMETER_MOMENTUM - ]): - """ - Create Gradient Machine From Proto object. - :param protoObj: Model config - :type protoObj: proto.ModelConfig_pb2.ModelConfig - :param createMode: Create Mode, default is normal. - :type createMode: int - :param paramTypes: the gradient machine parameter type. - :type paramTypes: list of int - :return: paddle.GradientMachine - """ - assert isinstance(protoObj, paddle.proto.ModelConfig) - return swig_paddle.GradientMachine.createByConfigProtoStr( - protoObj.SerializeToString(), createMode, paramTypes) - - swig_paddle.GradientMachine.createFromConfigProto = \ - staticmethod(createFromConfigProto) - - def forwardTest(self, inArgs): - """ - forwardTest. forward gradient machine in test mode, and return a numpy - matrix dict. - - :param inArgs: The input arguments - :type inArgs: paddle.Arguments - :return: A dictionary with keys ['id', 'value'], each value is a - numpy.ndarray. - """ - outArgs = swig_paddle.Arguments.createArguments(0) - self.forward(inArgs, outArgs, swig_paddle.PASS_TEST) - return [ - __arguments_to_numpy__(i, outArgs) - for i in xrange(outArgs.getSlotNum()) - ] - - swig_paddle.GradientMachine.forwardTest = forwardTest - - # Monkey patching backward - swig_paddle.GradientMachine.__backward__ = swig_paddle.GradientMachine.backward - - def backward(self, callback): - """ - GradientMachine Backward - :param callback: a callback which parameter is (paddle.Parameter) or - a paddle.UpdateCallback object. - """ - self.__backward__(__ParameterCallbackWrapper__.wrap(callback)) - - swig_paddle.GradientMachine.backward = backward - - # Monkey patching forwardBackward. - swig_paddle.GradientMachine.__forwardBackward__ = \ - swig_paddle.GradientMachine.forwardBackward - - def forwardBackward(self, - inArgs, - outArgs, - passType, - callback=swig_paddle.UpdateCallback()): - """ - GradientMachine forward backward. - :param inArgs: Input Arguments for GradientMachine. - :type inArgs: paddle.Arguments - :param outArgs: Output Arguments for GradientMachine. - :type outArgs: paddle.Arguments - :param passType: gradient machine's pass type. 
- :type passType: paddle.PassType - :param callback: a callable object with arguments (paddle.Parameter) or - a paddle.UpdateCallback it will be called when - backward - """ - self.__forwardBackward__(inArgs, outArgs, passType, - __ParameterCallbackWrapper__.wrap(callback)) - - swig_paddle.GradientMachine.forwardBackward = forwardBackward - - def getParameters(self): - return (self.getParameter(i) for i in xrange(self.getParameterSize())) - - swig_paddle.GradientMachine.getParameters = getParameters - - def getNonStaticParameters(self): - return (self.getNonStaticParameter(i) - for i in xrange(self.getNonStaticParameterSize())) - - swig_paddle.GradientMachine.getNonStaticParameters = getNonStaticParameters - - def getLayerOutputs(self, layerNames): - """ - getLayerOutputs. get outputs of layers and return a numpy matrix dict. - :param layerNames: layer names. - :type layerNames: string or list. - """ - if isinstance(layerNames, basestring): - layerNames = [layerNames] - elif not isinstance(layerNames, list): - raise RuntimeError("Input args shuld be string or a sting list.") - - output = dict() - for name in layerNames: - output[name] = __arguments_to_numpy__(0, self.getLayerOutput(name)) - return output - - swig_paddle.GradientMachine.getLayerOutputs = getLayerOutputs - - -def loadGradientMachine(config_filename, model_dir=None): - """ - Load a gradient machine from config file name/path. - :param config_filename: The trainer config file name/path - :param model_dir: The model parameter directory. None if same as the - directory of config_filename - :return: GradientMachine with some enhance methods. - :rtype: paddle.GradientMachine - """ - trainer_config = swig_paddle.TrainerConfig.createFromTrainerConfigFile( - config_filename) - assert isinstance(trainer_config, swig_paddle.TrainerConfig) - model_conf = trainer_config.getModelConfig() - network = swig_paddle.GradientMachine.createByModelConfig(model_conf) - assert isinstance(network, swig_paddle.GradientMachine) - if model_dir is None: - model_dir = os.path.dirname(config_filename) - network.loadParameters(model_dir) - return network - - -def loadParameterFile(fn): - """ - Load Paddle Parameter file to numpy.ndarray - :param fn: file name or file like object. - :type fn: str or file like object. - :return: numpy array - :rtype: numpy.ndarray - :raise: paddle.UnsupportError when parameter format is wrong. - """ - if isinstance(fn, str): - with open(fn, 'rb') as f: - return loadParameterFile(f) - elif hasattr(fn, 'read'): # File like object - version, = struct.unpack('i', fn.read(4)) - if version != 0: - raise swig_paddle.UnsupportError() - value_length, = struct.unpack("I", fn.read(4)) - if value_length != 4 and value_length != 8: - raise swig_paddle.UnsupportError() - dtype = 'float32' if value_length == 4 else 'float64' - param_size, = struct.unpack("L", fn.read(8)) - value = numpy.fromfile(fn, dtype) - if len(value) != param_size: - raise swig_paddle.UnsupportError() - return value - else: - raise swig_paddle.UnsupportError() - - -class DataProviderWrapperConverter(object): - """ - A class convert DataFormat from PyDataProvider Wrapper to - py_paddle.paddle.Arguemnts. 
- """ - - class DenseValueConverter(object): - """ - Internal class - """ - - def __init__(self, header_def): - self.__dim__ = header_def.dim - self.buf = [] - - def append(self, other): - assert len(other) == self.__dim__ - self.buf += other - - def __call__(self, slot_idx, arg): - mat = swig_paddle.Matrix.createDense(self.buf, - len(self.buf) / self.__dim__, - self.__dim__) - arg.setSlotValue(slot_idx, mat) - - class IdValueConverter(object): - """ - Internal class - """ - - def __init__(self, *args): - self.buf = [] - - def append(self, other): - assert isinstance(other, int) - self.buf.append(other) - - def __call__(self, slot_idx, arg): - arg.setSlotIds(slot_idx, swig_paddle.IVector.create(self.buf)) - - class SparseNonValueConverter(object): - """ - Internal class - """ - - def __init__(self, slot_def): - self.indices = [0] - self.cols = [] - self.dim = slot_def.dim - - def append(self, other): - self.indices.append(self.indices[-1] + len(other)) - self.cols += other - - def __call__(self, slot_idx, arg): - mat = swig_paddle.Matrix.createSparse( - len(self.indices) - 1, self.dim, len(self.cols), True) - assert isinstance(mat, swig_paddle.Matrix) - mat.sparseCopyFrom(self.indices, self.cols) - self.putIntoArg(slot_idx, arg, mat) - - def putIntoArg(self, slot_idx, arg, mat): - arg.setSlotValue(slot_idx, mat) - - class SparseValueConverter(SparseNonValueConverter): - """ - Internal class - """ - - def __init__(self, slot_def): - super(DataProviderWrapperConverter.SparseValueConverter, - self).__init__(slot_def) - self.values = [] - - def append(self, other): - super(DataProviderWrapperConverter.SparseValueConverter, - self).append(map(lambda x: x[0], other)) - self.values += map(lambda x: x[1], other) - - def __call__(self, slot_idx, arg): - mat = swig_paddle.Matrix.createSparse( - len(self.indices) - 1, self.dim, len(self.cols), False) - assert isinstance(mat, swig_paddle.Matrix) - mat.sparseCopyFrom(self.indices, self.cols, self.values) - self.putIntoArg(slot_idx, arg, mat) - - __SLOT_VALUE_CONVERTER_MAP__ = { - paddle.trainer.PyDataProviderWrapper.DenseSlot: DenseValueConverter, - paddle.trainer.PyDataProviderWrapper.IndexSlot: IdValueConverter, - paddle.trainer.PyDataProviderWrapper.SparseNonValueSlot: - SparseNonValueConverter, - paddle.trainer.PyDataProviderWrapper.SparseValueSlot: - SparseValueConverter - } - - def __init__(self, use_seq, header): - """ - Ctor - :param use_seq: True if use sequence. - :param header: List of slots type, - trainer.PyDataProviderWrapper.SlotType - """ - self.__use_seq__ = use_seq - self.__header__ = header - - def convert(self, wrapper_data, argument=None): - """ - Convert PyDataProviderWrapper format to paddle.Argument - :param wrapper_data: PyDataProviderWrapper yield's data list. - :param argument: The output paddle.Arguments. - If it is not None, it will assign data in this - arguments, else it will create new arguments. - :return: arguments that contains data. 
- :rtype: paddle.Arguments - """ - if argument is None: - argument = swig_paddle.Arguments.createArguments(0) - assert isinstance(argument, swig_paddle.Arguments) - argument.resize(len(self.__header__)) - - values = map( - lambda x: DataProviderWrapperConverter.__SLOT_VALUE_CONVERTER_MAP__[x.__class__](x), - self.__header__) - - if self.__use_seq__: - seq_dim = [[] for _ in xrange(self.__header__.__len__())] - seq_start_pos = [[0] for _ in xrange(self.__header__.__len__())] - - for each_sample in wrapper_data: - for slot_idx, sequence in enumerate(each_sample): - for raw_data in sequence: - values[slot_idx].append(raw_data) - seq_start_pos[slot_idx].append(seq_start_pos[slot_idx][-1] + - len(sequence)) - seq_dim[slot_idx].append(len(sequence)) - - for slot_idx in xrange(len(self.__header__)): - argument.setSlotSequenceDim( - slot_idx, swig_paddle.IVector.create(seq_dim[slot_idx])) - argument.setSlotSequenceStartPositions( - slot_idx, - swig_paddle.IVector.create(seq_start_pos[slot_idx])) - else: - for each_sample in wrapper_data: - for raw_data, value in zip(each_sample, values): - value.append(raw_data) - - for i, v in enumerate(values): - v(i, argument) - - return argument - - def __call__(self, wrapper_data, argument=None): - """ - Invoke self.convert. See documents in self.convert. - """ - return self.convert(wrapper_data, argument) - - -def __monkey_patch_protobuf_objects__(): - def ParameterConfig_toProto(self): - """ - Convert paddle.ParameterConfig to - proto.ParameterConfig_pb2.ParameterConfig - - :return: proto.ParameterConfig_pb2.ParameterConfig object. - """ - param_conf = paddle.proto.ParameterConfig_pb2.ParameterConfig() - param_conf.ParseFromString(self.toProtoString()) - return param_conf - - swig_paddle.ParameterConfig.toProto = ParameterConfig_toProto - - def OptimizationConfig_toProto(self): - """ - Convert paddle.OptimizationConfig to - proto.TrainerConfig_pb2.OptimizationConfig - - :return: proto.TrainerConfig_pb2.OptimizationConfig - """ - opt_conf = proto.TrainerConfig_pb2.OptimizationConfig() - opt_conf.ParseFromString(self.toProtoString()) - return opt_conf - - swig_paddle.OptimizationConfig.toProto = OptimizationConfig_toProto - - def OptimizationConfig_createFromProto(protoObj): - """ - Create a new paddle.OptimizationConfig from - proto.TrainerConfig_pb2.OptimizationConfig - - :param protoObj: proto.TrainerConfig_pb2.OptimizationConfig - :return: paddle.OptimizationConfig - """ - - assert isinstance(protoObj, paddle.proto.OptimizationConfig) - return swig_paddle.OptimizationConfig.createFromProtoString( - protoObj.SerializeToString()) - - swig_paddle.OptimizationConfig.createFromProto = staticmethod( - OptimizationConfig_createFromProto) - - def TrainerConfig_createFromProto(protoObj): - """ - Create a new paddle.TrainerConfig from - proto.OptimizationConfig - - :param protoObj: proto.TrainerConfig - :return: paddle.TrainerConfig - """ - assert isinstance(protoObj, paddle.proto.TrainerConfig) - return swig_paddle.TrainerConfig.createFromProtoString( - protoObj.SerializeToString()) - - swig_paddle.TrainerConfig.createFromProto = staticmethod( - TrainerConfig_createFromProto) - - -def __monkey_patch_parameter__(): - def getBufs(self): - """ - get all parameter vectors. - NOTE: the return value is a generator. Maybe you need to cast to - list or tuple or something else. - - :return: generator of all parameter vectors. 
- :rtype: generator - """ - return (self.getBuf(i) for i in xrange(swig_paddle.NUM_PARAMETER_TYPES)) - - swig_paddle.Parameter.getBufs = getBufs - - -def __monkey_patch_trainer__(): - swig_paddle.Trainer.__create__ = staticmethod(swig_paddle.Trainer.create) - - def Trainer_create(config, model=None): - """ - Create a trainer for model with TrainerCOnfig trainer_config - trainer_config.model_config will be ignored when model is supplied. - Trainer.trainOneBatch() and Trainer.forwardOneBatch() can be used only - when trainer_config.data_config is set. - - A typical usage for Trainer is: - .. code-block:: python - trainer = Trainer.create(trainer_config, model) - for p in xrange(num_passes) - while True: - data = get_next_batch(batch_size) - if not data: - break - trainer.trainOneDataBatch(batch_size, data) - trainer.finishTrainPass() - trainer.finishTrain() - - The trainer will take care of logging, model saving, distributed - training, etc. - - :param config: trainer configuration - :type config: paddle.proto.TrainerConfig - :param model: the model to be trained - :type model: swig_paddle.GradientMachine - :return: a trainer - :rtype swig_paddle.Trainer - - """ - assert isinstance(config, paddle.proto.TrainerConfig) - if model is not None: - assert isinstance(model, swig_paddle.GradientMachine) - return swig_paddle.Trainer.__create__( - swig_paddle.TrainerConfig.createFromProto(config), model) - - swig_paddle.Trainer.create = staticmethod(Trainer_create) - - swig_paddle.Trainer.__getForwardOutput__ = \ - swig_paddle.Trainer.getForwardOutput - - def getForwardOutput(self): - """ - Get the netword outputs from the previous trainOneBatch(), - trainOneDataBatch(), testOneDataPatch(), or forwardOneBatch() call. - - :return: list of dictionary with keys ['id', 'value'], each value is a - numpy.ndarray. - """ - outArgs = self.__getForwardOutput__() - return [ - __arguments_to_numpy__(i, outArgs) - for i in xrange(outArgs.getSlotNum()) - ] - - swig_paddle.Trainer.getForwardOutput = getForwardOutput - - -def monkeypatches(): - patches = [ - __monkeypatch_init_paddle__, __monkeypatch_gradient_machine__, - __monkey_patch_protobuf_objects__, __monkey_patch_parameter__, - __monkey_patch_trainer__ - ] - for patch in patches: - patch() diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md index 2772224506..dd3242f62b 100644 --- a/paddle/scripts/README.md +++ b/paddle/scripts/README.md @@ -68,7 +68,6 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" | `WITH_TESTING` | OFF | Build unit tests binaries. | | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. | | `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. | -| `WITH_SWIG_PY` | ON | Build with SWIG python API support. | | `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. | | `WITH_STYLE_CHECK` | ON | Check the code style when building. 
| | `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu | diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index cbd39d7a5d..a06952782b 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -180,7 +180,6 @@ function cmake_gen() { -DWITH_GOLANG=${WITH_GOLANG:-OFF} -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} -DWITH_PYTHON=${WITH_PYTHON:-ON} - -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} -DCUDNN_ROOT=/usr/ -DWITH_TESTING=${WITH_TESTING:-ON} -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake @@ -214,7 +213,6 @@ EOF -DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_GOLANG=${WITH_GOLANG:-OFF} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ - -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \ -DWITH_PYTHON=${WITH_PYTHON:-ON} \ -DCUDNN_ROOT=/usr/ \ -DWITH_TESTING=${WITH_TESTING:-ON} \ diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 37ad77549c..59e695e6fc 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -39,7 +39,6 @@ add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE}) IF(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle/ - COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python @@ -48,7 +47,6 @@ ELSE(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND touch stub.cc COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python - COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python From b4ccae75c044f513d18dc08b383ca5c3ce3be41a Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 21 Jan 2019 18:37:52 +0800 Subject: [PATCH 044/123] remove legacy target in cmake/util.cmake --- cmake/util.cmake | 115 ----------------------------------------------- 1 file changed, 115 deletions(-) diff --git a/cmake/util.cmake b/cmake/util.cmake index 0dc33ce385..02667dbce6 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -53,118 +53,3 @@ function(target_circle_link_libraries TARGET_NAME) "-Wl,--end-group") endif() endfunction() - -# compile_cu_as_cpp -# Make a cu file compiled as C++ -# Arguments: Source files -macro(compile_cu_as_cpp) - foreach(s ${ARGN}) - set_source_files_properties(${s} PROPERTIES LANGUAGE CXX) - set_source_files_properties(${s} PROPERTIES COMPILE_FLAGS "-x c++") - endforeach() -endmacro() - -# link_paddle_exe -# add paddle library for a paddle executable, such as trainer, pserver. -# -# It will handle WITH_PYTHON etc. 
-function(link_paddle_exe TARGET_NAME) - if(WITH_RDMA) - generate_rdma_links() - endif() - - if(MOBILE_INFERENCE) - target_circle_link_libraries(${TARGET_NAME} - ARCHIVE_START - paddle_gserver - paddle_function - ARCHIVE_END - paddle_math - paddle_utils - paddle_parameter - paddle_proto - paddle_cuda - ${EXTERNAL_LIBS} - ${CMAKE_THREAD_LIBS_INIT} - ${CMAKE_DL_LIBS} - ${RDMA_LD_FLAGS} - ${RDMA_LIBS}) - else() - target_circle_link_libraries(${TARGET_NAME} - ARCHIVE_START - paddle_gserver - paddle_function - ARCHIVE_END - paddle_pserver - paddle_trainer_lib - paddle_network - paddle_math - paddle_utils - paddle_parameter - paddle_proto - paddle_cuda - paddle_optimizer - ${EXTERNAL_LIBS} - ${CMAKE_THREAD_LIBS_INIT} - ${CMAKE_DL_LIBS} - ${RDMA_LD_FLAGS} - ${RDMA_LIBS}) - endif() - - if(ANDROID) - target_link_libraries(${TARGET_NAME} log) - endif(ANDROID) - - if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB) - target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") - endif() - - add_dependencies(${TARGET_NAME} ${external_project_dependencies}) -endfunction() - -# link_paddle_test -# Link a paddle unittest for target -# TARGET_NAME: the unittest target name -# Rest Arguemnts: not used. -function(link_paddle_test TARGET_NAME) - link_paddle_exe(${TARGET_NAME}) - target_link_libraries(${TARGET_NAME} - paddle_test_main - paddle_test_util - ${GTEST_LIBRARIES}) -endfunction() - -# add_unittest_without_exec -# -# create a paddle unittest. not specifically define how to run this unittest. -# TARGET_NAME: the unittest target name, same as executable file name -# Rest Arguments: the source files to compile this unittest. -macro(add_unittest_without_exec TARGET_NAME) - add_executable(${TARGET_NAME} ${ARGN}) - link_paddle_test(${TARGET_NAME}) -endmacro() - -# add_unittest -# create a paddle unittest and just to execute this binary to make unittest. -# -# TARGET_NAME: the unittest target name, same as executable file name -# Rest Arguments: the source files to compile this unittest. -macro(add_unittest TARGET_NAME) - add_unittest_without_exec(${TARGET_NAME} ${ARGN}) - add_test(${TARGET_NAME} ${TARGET_NAME}) -endmacro() - -# add_simple_unittest -# create a paddle unittest with file name. It just compile ${TARGET_NAME}.cpp to -# ${TARGET_NAME} and then execute it. 
-macro(add_simple_unittest TARGET_NAME) - add_unittest(${TARGET_NAME} ${TARGET_NAME}.cpp) -endmacro() - -# Creates C resources file from files in given resource file -function(create_resources res_file output_file) - add_custom_command( - OUTPUT ${output_file} - COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file} - DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py) -endfunction() From 9353bc58ddf4a07117eabf1bcc8698731dace5ae Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 21 Jan 2019 18:38:58 +0800 Subject: [PATCH 045/123] remove legacy MOBILE_INFERENCE option --- CMakeLists.txt | 10 +--------- cmake/external/cares.cmake | 2 +- cmake/external/grpc.cmake | 2 +- cmake/external/gzstream.cmake | 4 ---- cmake/external/protobuf.cmake | 15 +-------------- cmake/external/snappy.cmake | 4 ---- cmake/external/snappystream.cmake | 4 ---- cmake/external/warpctc.cmake | 4 ---- cmake/generic.cmake | 8 +------- cmake/inference_lib.cmake | 32 +++++++++++++++---------------- 10 files changed, 20 insertions(+), 65 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b4a700f974..8136829a50 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,10 +115,6 @@ if(ANDROID OR IOS) "Disable nGraph when cross-compiling for Android and iOS" FORCE) set(WITH_GOLANG OFF CACHE STRING "Disable golang when cross-compiling for Android and iOS" FORCE) - - # Compile PaddlePaddle mobile inference library - set(MOBILE_INFERENCE ON) - add_definitions(-DPADDLE_MOBILE_INFERENCE) endif() if (APPLE) @@ -142,11 +138,7 @@ set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING "A path setting fluid inference shared and static libraries") -if(MOBILE_INFERENCE) - set(THIRD_PARTY_BUILD_TYPE MinSizeRel) -else() - set(THIRD_PARTY_BUILD_TYPE Release) -endif() +set(THIRD_PARTY_BUILD_TYPE Release) set(WITH_MKLML ${WITH_MKL}) if (NOT DEFINED WITH_MKLDNN) diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake index a743b572a6..52507a6ae4 100644 --- a/cmake/external/cares.cmake +++ b/cmake/external/cares.cmake @@ -13,7 +13,7 @@ # limitations under the License. # -IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) +IF(NOT WITH_DISTRIBUTE) return() ENDIF() diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index fd9835d023..c5754da59b 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -13,7 +13,7 @@ # limitations under the License. # -IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) +IF(NOT WITH_DISTRIBUTE) return() ENDIF() diff --git a/cmake/external/gzstream.cmake b/cmake/external/gzstream.cmake index 3e36ef7ae2..af7a8bfda6 100644 --- a/cmake/external/gzstream.cmake +++ b/cmake/external/gzstream.cmake @@ -13,10 +13,6 @@ # limitations under the License. # -IF(MOBILE_INFERENCE) - return() -ENDIF() - include (ExternalProject) # NOTE: gzstream is needed when linking with ctr reader. 
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 16fd9fac92..d0f7f7409b 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -204,15 +204,6 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") - IF(MOBILE_INFERENCE) - # The reason why the official version is not used is described in - # https://github.com/PaddlePaddle/Paddle/issues/6114 - SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git") - SET(PROTOBUF_TAG "v3.2.0") - IF(NOT BUILD_FOR_HOST) - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF") - ENDIF() - ENDIF() ExternalProject_Add( ${TARGET_NAME} @@ -240,11 +231,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -IF(NOT MOBILE_INFERENCE) - SET(PROTOBUF_VERSION 3.1) -ELSE() - SET(PROTOBUF_VERSION 3.2) -ENDIF() +SET(PROTOBUF_VERSION 3.1) IF(CMAKE_CROSSCOMPILING) build_protobuf(protobuf_host TRUE) LIST(APPEND external_project_dependencies protobuf_host) diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index f9d4cd9740..27d075336d 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -if(MOBILE_INFERENCE OR RPI) - return() -endif() - include (ExternalProject) # NOTE: snappy is needed when linking with recordio diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake index 1ec79462c1..392f186b7c 100644 --- a/cmake/external/snappystream.cmake +++ b/cmake/external/snappystream.cmake @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -IF(MOBILE_INFERENCE OR RPI) - return() -ENDIF() - include (ExternalProject) set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 7b937c93fe..7a25aaf15f 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-IF(MOBILE_INFERENCE) - return() -ENDIF() - INCLUDE(ExternalProject) SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 3f1be11d85..7dd59577c4 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -655,12 +655,6 @@ function(paddle_protobuf_generate_cpp SRCS HDRS) set(${SRCS}) set(${HDRS}) - if (MOBILE_INFERENCE) - set(EXTRA_FLAG "lite:") - else() - set(EXTRA_FLAG "") - endif() - foreach(FIL ${ARGN}) get_filename_component(ABS_FIL ${FIL} ABSOLUTE) get_filename_component(FIL_WE ${FIL} NAME_WE) @@ -677,7 +671,7 @@ function(paddle_protobuf_generate_cpp SRCS HDRS) COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} -I${CMAKE_CURRENT_SOURCE_DIR} - --cpp_out "${EXTRA_FLAG}${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} + --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} DEPENDS ${ABS_FIL} protoc COMMENT "Running C++ protocol buffer compiler on ${FIL}" VERBATIM ) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 3e11d332ff..a7dce4dfdb 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -149,25 +149,23 @@ if (WITH_NGRAPH) ) endif () -if (NOT MOBILE_INFERENCE AND NOT RPI) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") - copy(snappy_lib - SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappy) +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") +copy(snappy_lib + SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappy) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") - copy(snappystream_lib - SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappystream) +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") +copy(snappystream_lib + SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappystream) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") - copy(zlib_lib - SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS zlib) -endif () +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") +copy(zlib_lib + SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS zlib) # paddle fluid module set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") From 2d529186f1592d3751d83c58f0818de7ec7aa0de Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 21 Jan 2019 18:46:54 +0800 Subject: [PATCH 046/123] remove legacy CMAKE_CROSSCOMPILING option --- CMakeLists.txt | 4 +- cmake/cblas.cmake | 28 +- cmake/configure.cmake | 10 +- cmake/cross_compiling/android.cmake | 236 --------------- cmake/cross_compiling/host.cmake | 49 ---- cmake/cross_compiling/ios.cmake | 347 ----------------------- cmake/cross_compiling/raspberry_pi.cmake | 84 ------ cmake/cuda.cmake | 4 +- cmake/external/openblas.cmake | 38 +-- cmake/external/protobuf.cmake | 18 +- cmake/flags.cmake | 6 +- 11 files changed, 28 insertions(+), 796 deletions(-) delete mode 100644 cmake/cross_compiling/android.cmake delete mode 100644 cmake/cross_compiling/host.cmake delete mode 100644 cmake/cross_compiling/ios.cmake delete mode 100644 cmake/cross_compiling/raspberry_pi.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 8136829a50..de62382b78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,9 +33,7 @@ if(WIN32) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") 
endif(WIN32) -if(NOT CMAKE_CROSSCOMPILING) - find_package(CUDA QUIET) -endif(NOT CMAKE_CROSSCOMPILING) +find_package(CUDA QUIET) find_package(Git REQUIRED) find_package(Threads REQUIRED) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 24de8d9d7c..74b1ef2122 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -64,24 +64,18 @@ endif() ## Then find the reference-cblas. www.netlib.org/blas/ set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH "Folder contains reference-cblas") -if(NOT CMAKE_CROSSCOMPILING) - set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/include - /usr/include - /usr/include/cblas - ) +set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/include + /usr/include + /usr/include/cblas +) - set(REFERENCE_CBLAS_LIB_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/lib - /usr/lib - /usr/lib/blas/reference/ - /usr/lib/reference/ - ) -else() - # Disable the finding of reference cblas under host's system path - set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include) - set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib) -endif() +set(REFERENCE_CBLAS_LIB_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/lib + /usr/lib + /usr/lib/blas/reference/ + /usr/lib/reference/ +) if(WITH_SYSTEM_BLAS) find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS diff --git a/cmake/configure.cmake b/cmake/configure.cmake index e3d856fb30..076e839120 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -49,12 +49,10 @@ if(NOT WITH_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER) endif(NOT WITH_PROFILER) -if(NOT CMAKE_CROSSCOMPILING) - if(WITH_AVX AND AVX_FOUND) - set(SIMD_FLAG ${AVX_FLAG}) - elseif(SSE3_FOUND) - set(SIMD_FLAG ${SSE3_FLAG}) - endif() +if(WITH_AVX AND AVX_FOUND) + set(SIMD_FLAG ${AVX_FLAG}) +elseif(SSE3_FOUND) + set(SIMD_FLAG ${SSE3_FLAG}) endif() if(WIN32) diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake deleted file mode 100644 index 4cf2be3bdf..0000000000 --- a/cmake/cross_compiling/android.cmake +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This is a toolchain file for cross-compiling for Android, and the -# configuration refers to the open-source resposity: -# https://github.com/taka-no-me/android-cmake -# Most of the variables are compatible with that used in -# https://developer.android.com/ndk/guides/cmake.html -# The supported variables are listed belows: -# -# ANDROID_STANDALONE_TOOLCHAIN -# ANDROID_TOOLCHAIN -# ANDROID_ABI -# ANDROID_NATIVE_API_LEVEL -# ANDROID_ARM_MODE -# ANDROID_ARM_NEON -# -# For CMake >= 3.7.0, all the settings will be delivered to CMake system -# variables to let CMake do the cross-compiling configurations itself. 
-# More detail of cross-compiling settings -# https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html - -IF(NOT ANDROID) - return() -ENDIF() - -# check the exist of android standalone toolchain -IF(NOT DEFINED ANDROID_STANDALONE_TOOLCHAIN) - SET(ANDROID_STANDALONE_TOOLCHAIN $ENV{ANDROID_STANDALONE_TOOLCHAIN} - CACHE PATH "Folder holds the standalone toolchain of Android NDK") -ENDIF() -IF(NOT ANDROID_STANDALONE_TOOLCHAIN) - MESSAGE(WARNING "It is recommended to set ANDROID_STANDALONE_TOOLCHAIN to " - "use a standalone toolchain.\n" - "To cross-compile for Android, you need to:\n" - "1. Download an Android NDK from" - " https://developer.android.com/ndk/downloads/index.html\n" - "2. Setup a standalone toolchain" - "https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn\n") -ENDIF() - -IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL) - IF(ANDROID_NATIVE_API_LEVEL MATCHES "^android-[0-9]+$") - STRING(REPLACE "android-" "" CMAKE_SYSTEM_VERSION "${CMAKE_MATCH_0}") - ELSEIF(ANDROID_NATIVE_API_LEVEL MATCHES "^[0-9]+$") - SET(CMAKE_SYSTEM_VERSION ${ANDROID_NATIVE_API_LEVEL}) - ENDIF() -ENDIF() - -IF(NOT DEFINED ANDROID_TOOLCHAIN) - SET(ANDROID_TOOLCHAIN clang) -ENDIF() - -IF(NOT DEFINED ANDROID_ABI) - SET(ANDROID_ABI "armeabi-v7a") -ENDIF() - -IF(NOT DEFINED ANDROID_ARM_MODE) - SET(ANDROID_ARM_MODE ON) -ENDIF() -IF(ANDROID_ARM_MODE) - SET(ANDROID_ARM_MODE_NAME "arm") -ELSE(ANDROID_ARM_MODE) - SET(ANDROID_ARM_MODE_NAME "thumb") -ENDIF(ANDROID_ARM_MODE) - -IF(NOT DEFINED ANDROID_ARM_NEON) - SET(ANDROID_ARM_NEON ON) -ENDIF() - -IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") - IF("${CMAKE_VERSION}" VERSION_LESS "3.1.0") - SET(CMAKE_SYSTEM_NAME "Linux") - ENDIF() - MESSAGE(WARNING "It is recommended to use CMake >= 3.7.0 (current version: " - "${CMAKE_VERSION}), when cross-compiling for Android.") - - IF(ANDROID_STANDALONE_TOOLCHAIN) - # Use standalone toolchain - SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot") - - IF(NOT CMAKE_SYSTEM_VERSION) - SET(ANDROID_STANDALONE_TOOLCHAIN_API "") - SET(ANDROID_API_LEVEL_H_REGEX "^[\t ]*#[\t ]*define[\t ]+__ANDROID_API__[\t ]+([0-9]+)") - FILE(STRINGS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h" - ANDROID_API_LEVEL_H_CONTENT REGEX "${ANDROID_API_LEVEL_H_REGEX}") - IF(ANDROID_API_LEVEL_H_CONTENT MATCHES "${ANDROID_API_LEVEL_H_REGEX}") - SET(ANDROID_STANDALONE_TOOLCHAIN_API "${CMAKE_MATCH_1}") - ENDIF() - SET(CMAKE_SYSTEM_VERSION ${ANDROID_STANDALONE_TOOLCHAIN_API}) - ENDIF() - - # Toolchain - SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN}) - ELSE(ANDROID_NDK) - # TODO: use android ndk - ENDIF() - - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi) - IF(ANDROID_ABI STREQUAL "armeabi") - SET(CMAKE_SYSTEM_PROCESSOR armv5te) - SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi) - ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") - SET(CMAKE_SYSTEM_PROCESSOR armv7-a) - SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi) - ENDIF() - ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") - SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) - SET(CMAKE_SYSTEM_PROCESSOR aarch64) - SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android) - ELSE() - MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.") - ENDIF() - SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") - - IF(ANDROID_TOOLCHAIN STREQUAL clang) - SET(ANDROID_C_COMPILER_NAME clang) - SET(ANDROID_CXX_COMPILER_NAME clang++) - 
SET(CMAKE_C_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE}) - SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE}) - ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc) - SET(ANDROID_C_COMPILER_NAME gcc) - SET(ANDROID_CXX_COMPILER_NAME g++) - ELSE() - MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}") - ENDIF() - - # C compiler - IF(NOT CMAKE_C_COMPILER) - SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}") - ELSE() - GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) - ENDIF() - IF(NOT EXISTS ${ANDROID_C_COMPILER}) - MESSAGE(FATAL_ERROR "Cannot find C compiler: ${ANDROID_C_COMPILER}") - ENDIF() - - # CXX compiler - IF(NOT CMAKE_CXX_COMPILER) - SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}") - ELSE() - GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) - ENDIF() - IF(NOT EXISTS ${ANDROID_CXX_COMPILER}) - MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${ANDROID_CXX_COMPILER}") - ENDIF() - - SET(CMAKE_C_COMPILER ${ANDROID_C_COMPILER} CACHE PATH "C compiler" FORCE) - SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) - - # Toolchain and ABI specific flags. - SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections") - SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections") - - IF(ANDROID_ABI STREQUAL "armeabi") - LIST(APPEND ANDROID_COMPILER_FLAGS - -march=armv5te - -mtune=xscale - -msoft-float) - ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") - LIST(APPEND ANDROID_COMPILER_FLAGS - -march=armv7-a - -mfloat-abi=softfp) - IF(ANDROID_ARM_NEON) - LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=neon) - ELSE() - LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16) - ENDIF() - LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8) - ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") - LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) - ENDIF() - - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - IF(ANDROID_ARM_MODE) - LIST(APPEND ANDROID_COMPILER_FLAGS -marm) - ELSE() - LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb) - ENDIF() - IF(ANDROID_TOOLCHAIN STREQUAL clang) - # Disable integrated-as for better compatibility. - LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as) - ENDIF() - ENDIF() - - IF(ANDROID_TOOLCHAIN STREQUAL clang) - # CMake automatically forwards all compiler flags to the linker, - # and clang doesn't like having -Wa flags being used for linking. - # To prevent CMake from doing this would require meddling with - # the CMAKE__COMPILE_OBJECT rules, which would get quite messy. 
- LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments) - ENDIF() - - STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}") - STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}") - - SET(CMAKE_C_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" - CACHE STRING "C flags") - SET(CMAKE_CXX_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_CXX_FLAGS}" - CACHE STRING "CXX flags") - SET(CMAKE_SHARED_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}" - CACHE STRING "shared linker flags") - - SET(CMAKE_POSITION_INDEPENDENT_CODE TRUE) - SET(CMAKE_EXE_LINKER_FLAGS "-pie -fPIE ${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" - CACHE STRING "executable linker flags") - - MESSAGE(STATUS "Android: Targeting API '${CMAKE_SYSTEM_VERSION}' " - "with architecture '${ANDROID_ARM_MODE_NAME}', " - "ABI '${ANDROID_ABI}', and processor '${CMAKE_SYSTEM_PROCESSOR}'") - MESSAGE(STATUS "System CMAKE_C_FLAGS: " ${CMAKE_C_FLAGS}) - MESSAGE(STATUS "System CMAKE_CXX_FLAGS: " ${CMAKE_CXX_FLAGS}) -ELSE() - IF(ANDROID_STANDALONE_TOOLCHAIN) - SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN}) - ENDIF() - SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI}) - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE}) - IF(ANDROID_ABI STREQUAL "armeabi-v7a") - SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON}) - ENDIF() - ENDIF() -ENDIF() diff --git a/cmake/cross_compiling/host.cmake b/cmake/cross_compiling/host.cmake deleted file mode 100644 index f9c6b12136..0000000000 --- a/cmake/cross_compiling/host.cmake +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# find host C compiler -IF(HOST_C_COMPILER) - SET(HOST_C_COMPILER_NAME ${HOST_C_COMPILER}) -ELSEIF(NOT $ENV{CC} STREQUAL "") - SET(HOST_C_COMPILER_NAME $ENV{CC}) -ELSE() - SET(HOST_C_COMPILER_NAME cc) -ENDIF() - -GET_FILENAME_COMPONENT(HOST_C_COMPILER_PATH ${HOST_C_COMPILER_NAME} PROGRAM) -IF(NOT HOST_C_COMPILER_PATH OR NOT EXISTS ${HOST_C_COMPILER_PATH}) - MESSAGE(FATAL_ERROR "Cannot find host C compiler, set host C compiler:\n" - "\tcmake .. -DHOST_C_COMPILER=...") -ENDIF() - -# find host CXX compiler -IF(HOST_CXX_COMPILER) - SET(HOST_CXX_COMPILER_NAME ${HOST_CXX_COMPILER}) -ELSEIF(NOT $ENV{CXX} STREQUAL "") - SET(HOST_CXX_COMPILER_NAME $ENV{CXX}) -ELSE() - SET(HOST_CXX_COMPILER_NAME c++) -ENDIF() - -GET_FILENAME_COMPONENT(HOST_CXX_COMPILER_PATH ${HOST_CXX_COMPILER_NAME} PROGRAM) -IF(NOT HOST_CXX_COMPILER_PATH OR NOT EXISTS ${HOST_CXX_COMPILER_PATH}) - MESSAGE(FATAL_ERROR "Cannot find host CXX compiler, set host CXX compiler:\n" - "\tcmake .. 
-DHOST_CXX_COMPILER=...") -ENDIF() - -SET(HOST_C_COMPILER ${HOST_C_COMPILER_PATH} CACHE PATH "Host C compiler") -SET(HOST_CXX_COMPILER ${HOST_CXX_COMPILER_PATH} CACHE PATH "Host CXX compiler") - -MESSAGE(STATUS "Found host C compiler: " ${HOST_C_COMPILER}) -MESSAGE(STATUS "Found host CXX compiler: " ${HOST_CXX_COMPILER}) diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake deleted file mode 100644 index 10d389ec8e..0000000000 --- a/cmake/cross_compiling/ios.cmake +++ /dev/null @@ -1,347 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This is a toolchain file for cross-compiling for iOS, and the -# configuration largely refers to public toolchain file: -# https://raw.githubusercontent.com/leetal/ios-cmake/master/ios.toolchain.cmake -# and -# https://github.com/cristeab/ios-cmake -# -# Supports options: -# IOS_PLATFORM = OS (default) or SIMULATOR -# This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders -# OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch. -# SIMULATOR - used to build for the Simulator platforms, which have an x86 arch. -# IOS_ARCH -# The archectures wanted to support, such "arm64", "armv7;arm64" -# IOS_DEPLOYMENT_TARGET -# The minimum iOS deployment version, such as "7.0" -# IOS_ENABLE_BITCODE = ON (default) or OFF -# IOS_USE_VECLIB_FOR_BLAS = OFF (default) or ON -# IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder -# By default this location is automatcially chosen based on the IOS_PLATFORM value above. -# If set manually, it will override the default location and force the user of a particular Developer Platform -# IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder -# By default this location is automatcially chosen based on the IOS_DEVELOPER_ROOT value. -# In this case it will always be the most up-to-date SDK found in the IOS_DEVELOPER_ROOT path. -# If set manually, this will force the use of a specific SDK version - -# Macros: -# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE) -# A convenience macro for setting xcode specific properties on targets -# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1") -# find_host_package (PROGRAM ARGS) -# A macro used to find executable programs on the host system, not within the iOS environment. -# Thanks to the android-cmake project for providing the command - -if(NOT IOS) - return() -endif() - -set(CMAKE_SYSTEM_NAME Darwin) - -# Get the Xcode version being used. 
-execute_process(COMMAND xcodebuild -version - OUTPUT_VARIABLE XCODE_VERSION - RESULT_VARIABLE XCODE_VERSION_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT ${XCODE_VERSION_RESULT}) - string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}") - string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}") - message(STATUS "Building with Xcode version: ${XCODE_VERSION}") -else() - message(FATAL_ERROR "Cannot execute xcodebuild, please check whether xcode is installed.") -endif() - -# Required as of cmake 2.8.10 -set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE) - -# Setup iOS platform unless specified manually with IOS_PLATFORM -if(NOT DEFINED IOS_PLATFORM) - set(IOS_PLATFORM "OS") -endif() -set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform") - -# Set the architecture for iOS -if(NOT DEFINED IOS_ARCH) - if(IOS_PLATFORM STREQUAL "OS") - set(IOS_ARCH "armv7;armv7s;arm64") - elseif(IOS_PLATFORM STREQUAL "SIMULATOR") - set(IOS_ARCH "i386;x86_64") - endif() -endif() -set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS") - -# Specify minimum iOS deployment version -if(NOT DEFINED IOS_DEPLOYMENT_TARGET) - set(IOS_DEPLOYMENT_TARGET "7.0") -endif() -set(IOS_DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET} CACHE STRING "Minimum iOS version") - -# Whether to enable bitcode -if(NOT DEFINED IOS_ENABLE_BITCODE) - set(IOS_ENABLE_BITCODE ON) -endif() -set(IOS_ENABLE_BITCODE ${IOS_ENABLE_BITCODE} CACHE BOOL "Whether to enable bitcode") - -if(NOT DEFINED IOS_USE_VECLIB_FOR_BLAS) - set(IOS_USE_VECLIB_FOR_BLAS OFF) -endif() -set(IOS_USE_VECLIB_FOR_BLAS ${IOS_UES_VECLIB_FOR_BLAS} CACHE BOOL "Whether to use veclib") - -# Check the platform selection and setup for developer root -if(${IOS_PLATFORM} STREQUAL "OS") - set(IOS_PLATFORM_LOCATION "iPhoneOS.platform") - set(XCODE_IOS_PLATFORM iphoneos) - - # This causes the installers to properly locate the output libraries - set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos") -elseif(${IOS_PLATFORM} STREQUAL "SIMULATOR") - set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform") - set(XCODE_IOS_PLATFORM iphonesimulator) - - # This causes the installers to properly locate the output libraries - set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator") -elseif(${IOS_PLATFORM} STREQUAL "WATCHOS") - set(IOS_PLATFORM_LOCATION "WatchOS.platform") - set(XCODE_IOS_PLATFORM watchos) - - # This causes the installers to properly locate the output libraries - set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-watchos") -else(${IOS_PLATFORM} STREQUAL "OS") - message(FATAL_ERROR "Unsupported IOS_PLATFORM value selected. 
Please set to\n" - "\t OS, SIMULATOR, or WATCHOS.") -endif() - -# Check iOS developer toolchain -if(NOT DEFINED IOS_DEVELOPER_ROOT) - # Setup iOS developer location - execute_process(COMMAND xcode-select -print-path - OUTPUT_VARIABLE XCODE_DEVELOPER_DIR - RESULT_VARIABLE XCODE_DEVELOPER_DIR_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - # Xcode 4.3 changed the installation location, choose the most recent one available - if(${XCODE_VERSION} VERSION_LESS "4.3.0") - set(IOS_DEVELOPER_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer") - else() - set(IOS_DEVELOPER_ROOT "${XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer") - endif() -endif() -if(EXISTS ${IOS_DEVELOPER_ROOT}) - set(IOS_DEVELOPER_ROOT ${IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform") -else() - message(FATAL_ERROR "Invalid IOS_DEVELOPER_ROOT: ${IOS_DEVELOPER_ROOT} does not exist.") -endif() - -# Check iOS SDK -if(NOT DEFINED IOS_SDK_ROOT) - # Find and use the most recent iOS sdk - file(GLOB IOS_SDK_LISTS "${IOS_DEVELOPER_ROOT}/SDKs/*") - if(IOS_SDK_LISTS) - list(SORT IOS_SDK_LISTS) - list(REVERSE IOS_SDK_LISTS) - list(GET IOS_SDK_LISTS 0 IOS_SDK_ROOT) - else(IOS_SDK_LISTS) - message(FATAL_ERROR "No iOS SDK's found in default search path ${IOS_DEVELOPER_ROOT}." - " Please manually set IOS_SDK_ROOT or install the iOS SDK.") - endif(IOS_SDK_LISTS) -endif() -if(EXISTS ${IOS_SDK_ROOT}) - set(IOS_SDK_ROOT ${IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK") - message(STATUS "iOS toolchain: ${IOS_SDK_ROOT}") -else() - message(FATAL_ERROR "Invalid IOS_SDK_ROOT: ${IOS_SDK_ROOT} does not exist.") -endif() - -# Set the sysroot default to the most recent SDK -set(CMAKE_OSX_SYSROOT ${IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support") - -# Get version of iOS SDK -execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion - OUTPUT_VARIABLE IOS_SDK_VERSION - RESULT_VARIABLE IOS_SDK_VERSION_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) -if(${IOS_SDK_VERSION_RESULT}) - string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" IOS_SDK_VERSION "${IOS_SDK_ROOT}") -endif() -if(NOT IOS_SDK_VERSION) - message(WARNING "Cannot get SDK's version.") - set(IOS_SDK_VERSION 1) -endif() -set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION}) - -# Find the C & C++ compilers for the specified SDK. 
-if(NOT CMAKE_C_COMPILER) - # Default to use clang - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang - OUTPUT_VARIABLE IOS_C_COMPILER - RESULT_VARIABLE IOS_C_COMPILER_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - if(${IOS_C_COMPILER_RESULT}) - get_filename_component(IOS_C_COMPILER clang PROGRAM) - endif() -else(NOT CMAKE_C_COMPILER) - # User can set it in cmake command - get_filename_component(IOS_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) -endif(NOT CMAKE_C_COMPILER) -if(NOT EXISTS ${IOS_C_COMPILER}) - message(FATAL_ERROR "Cannot find C compiler: ${IOS_C_COMPILER}") -endif() - -if(NOT CMAKE_CXX_COMPILER) - # Default to use clang++ - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++ - OUTPUT_VARIABLE IOS_CXX_COMPILER - RESULT_VARIABLE IOS_CXX_COMPILER_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - if(${IOS_CXX_COMPILER_RESULT}) - get_filename_component(IOS_CXX_COMPILER clang++ PROGRAM) - endif() -else(NOT CMAKE_CXX_COMPILER) - # User can set it in cmake command - get_filename_component(IOS_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) -endif(NOT CMAKE_CXX_COMPILER) -if(NOT EXISTS ${IOS_CXX_COMPILER}) - message(FATAL_ERROR "Cannot find CXX compiler: ${IOS_CXX_COMPILER}") -endif() - -set(CMAKE_C_COMPILER ${IOS_C_COMPILER} CACHE PATH "C compiler" FORCE) -set(CMAKE_CXX_COMPILER ${IOS_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) - -set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") -set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") -set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") -set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") - -# Set iOS specific C/C++ flags -if(IOS_PLATFORM STREQUAL "OS") - if(XCODE_VERSION VERSION_LESS "7.0") - set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-version-min=${IOS_DEPLOYMENT_TARGET}") - else() - # Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM. 
- set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") - endif() -else() - set(XCODE_IOS_FLATFORM_VERSION_FLAGS "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") -endif() - -if(IOS_ENABLE_BITCODE) - set(XCODE_IOS_BITCODE_FLAGS "${IOS_COMPILER_FLAGS} -fembed-bitcode") -else() - set(XCODE_IOS_BITCODE_FLAGS "") -endif() - -set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_FLAGS}") - -# Hidden visibilty is required for cxx on iOS -set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags") -set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") - -set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first") - -if(IOS_USE_VECLIB_FOR_BLAS) - # Find vecLib for iOS - set(VECLIB_SEARCH_DIRS - ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks - ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Frameworks - ) - find_path(VECLIB_INC_DIR vecLib.h PATHS ${VECLIB_SEARCH_DIRS}/vecLib.framework/Headers) - - include(FindPackageHandleStandardArgs) - find_package_handle_standard_args(vecLib DEFAULT_MSG VECLIB_INC_DIR) - - if(VECLIB_FOUND) - if(VECLIB_INC_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*") - set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework vecLib") - message(STATUS "Found standalone vecLib.framework") - else() - set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework Accelerate") - message(STATUS "Found vecLib as part of Accelerate.framework") - endif() - - endif() -endif() - -set(CMAKE_C_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_C_LINK_FLAGS}") -set(CMAKE_CXX_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_CXX_LINK_FLAGS}") - -set(CMAKE_PLATFORM_HAS_INSTALLNAME 1) -if(NOT IOS_ENABLE_BITCODE) - set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names") - set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names") -else() - set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib") - set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle") -endif() -set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,") -set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,") -set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a") - -# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree -# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache -# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun) -# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex -if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) - find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool) -endif() - -# Set the find root to the iOS developer roots and to user defined paths -set(CMAKE_FIND_ROOT_PATH ${IOS_DEVELOPER_ROOT} ${IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} - CACHE string "iOS find search path root") - -# default to searching for frameworks first -set(CMAKE_FIND_FRAMEWORK FIRST) - -# set up the default search directories for frameworks -set(CMAKE_SYSTEM_FRAMEWORK_PATH - ${IOS_SDK_ROOT}/System/Library/Frameworks - ${IOS_SDK_ROOT}/System/Library/PrivateFrameworks - ${IOS_SDK_ROOT}/Developer/Library/Frameworks - ) - -# only search the iOS sdks, not the remainder of the host filesystem -set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 
-set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) - -message(STATUS "iOS: Targeting iOS '${CMAKE_SYSTEM_VERSION}', " - "building for '${IOS_PLATFORM}' platform, with architecture '${CMAKE_OSX_ARCHITECTURES}'") -message(STATUS "System CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}") -message(STATUS "System CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") - -# Used in ExternalProject command -string(REPLACE ";" "\\$" EXTERNAL_IOS_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}") -set(EXTERNAL_OPTIONAL_ARGS - -DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT} - -DCMAKE_OSX_ARCHITECTURES=${EXTERNAL_IOS_ARCHITECTURES}) - -# This little macro lets you set any XCode specific property -macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE) - set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE}) -endmacro(set_xcode_property) - -# This macro lets you find executable programs on the host system -macro(find_host_package) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) - set(IOS FALSE) - - find_package(${ARGN}) - - set(IOS TRUE) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -endmacro(find_host_package) diff --git a/cmake/cross_compiling/raspberry_pi.cmake b/cmake/cross_compiling/raspberry_pi.cmake deleted file mode 100644 index 0425b2ae15..0000000000 --- a/cmake/cross_compiling/raspberry_pi.cmake +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This is a toolchain file for cross-compiling for Raspberry Pi. -# The supported variables are listed belows: -# -# RPI_TOOLCHAIN -# RPI_ARM_NEON -# -# Also you can set CMAKE_C/CXX_COMPILER yourself, through cmake arguments. 
- -IF(NOT RPI) - return() -ENDIF() - -SET(CMAKE_SYSTEM_NAME Linux) -SET(CMAKE_SYSTEM_VERSION 1) -SET(CMAKE_SYSTEM_PROCESSOR arm) - -# check the exist of raspberry pi toolchain -IF(NOT DEFINED RPI_TOOLCHAIN) - SET(RPI_TOOLCHAIN $ENV{RPI_TOOLCHAIN} - CACHE PATH "Folder holds the toolchain of Raspberr Pi") -ENDIF() -IF(NOT RPI_TOOLCHAIN) - MESSAGE(WARNING "It is recommended to set RPI_TOOLCHAIN to use toolchain.\n" - "To cross-compile for Raspberry Pi, you need to download the tools using:\n" - " git clone https://github.com/raspberrypi/tools\n") -ENDIF() - -IF(NOT DEFINED RPI_ARM_NEON) - SET(RPI_ARM_NEON ON) -ENDIF() - -IF(RPI_TOOLCHAIN) - SET(RPI_TOOLCHAIN_ROOT ${RPI_TOOLCHAIN}) - IF(RPI_TOOLCHAIN_ROOT MATCHES "gcc-linaro-arm-linux-gnueabihf-raspbian(-x64)?$") - # gcc-linaro-arm-linux-gnueabihf-raspbian - # gcc-linaro-arm-linux-gnueabihf-raspbian-x64 - SET(RPI_TOOLCHAIN_NAME arm-linux-gnueabihf) - ENDIF() - SET(RPI_TOOLCHAIN_PREFIX "${RPI_TOOLCHAIN_ROOT}/bin/${RPI_TOOLCHAIN_NAME}-") -ENDIF() - -# C compiler -IF(NOT CMAKE_C_COMPILER) - SET(RPI_C_COMPILER "${RPI_TOOLCHAIN_PREFIX}gcc") -ELSE() - GET_FILENAME_COMPONENT(RPI_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) -ENDIF() -IF(NOT EXISTS ${RPI_C_COMPILER}) - MESSAGE(FATAL_ERROR "Cannot find C compiler: ${RPI_C_COMPILER}") -ENDIF() - -# CXX compiler -IF(NOT CMAKE_CXX_COMPILER) - SET(RPI_CXX_COMPILER "${RPI_TOOLCHAIN_PREFIX}g++") -ELSE() - GET_FILENAME_COMPONENT(RPI_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) -ENDIF() -IF(NOT EXISTS ${RPI_CXX_COMPILER}) - MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${RPI_CXX_COMPILER}") -ENDIF() - -SET(CMAKE_C_COMPILER ${RPI_C_COMPILER} CACHE PATH "C compiler" FORCE) -SET(CMAKE_CXX_COMPILER ${RPI_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) - -IF(RPI_ARM_NEON) - SET(RPI_C_FLAGS "${RPI_C_FLAGS} -mfpu=neon") -ENDIF() - -SET(CMAKE_C_FLAGS "${RPI_C_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags") -SET(CMAKE_CXX_FLAGS "${RPI_C_FLAGS} ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 16432ce2b8..ea46f6418e 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -63,9 +63,7 @@ function(select_nvcc_arch_flags out_variable) # List of arch names set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") set(archs_name_default "All") - if(NOT CMAKE_CROSSCOMPILING) - list(APPEND archs_names "Auto") - endif() + list(APPEND archs_names "Auto") # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui) set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.") diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 019745aad0..b347a59292 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -40,38 +40,12 @@ IF(NOT ${CBLAS_FOUND}) SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") SET(OPENBLAS_COMMIT "v0.2.20") - IF(CMAKE_CROSSCOMPILING) - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) - GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY) - SET(CROSS_SUFFIX ${CROSS_SUFFIX}/) - IF(ANDROID) - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - # use softfp - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) - ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) - ENDIF() - ELSEIF(IOS) - IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") - SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") - 
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) - ELSE() - MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. " - "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.") - ENDIF() - ELSEIF(RPI) - # use hardfp - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0) - ENDIF() - ELSE() - IF(APPLE) - SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") - ENDIF() - SET(OPTIONAL_ARGS "") - IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") - SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) - ENDIF() + IF(APPLE) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") + ENDIF() + SET(OPTIONAL_ARGS "") + IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") + SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) ENDIF() SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index d0f7f7409b..e05b7694dd 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -232,14 +232,6 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ENDFUNCTION() SET(PROTOBUF_VERSION 3.1) -IF(CMAKE_CROSSCOMPILING) - build_protobuf(protobuf_host TRUE) - LIST(APPEND external_project_dependencies protobuf_host) - - SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_host_PROTOC_EXECUTABLE} - CACHE FILEPATH "protobuf executable." FORCE) -ENDIF() - IF(NOT PROTOBUF_FOUND) build_protobuf(extern_protobuf FALSE) @@ -253,11 +245,7 @@ IF(NOT PROTOBUF_FOUND) SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." FORCE) - IF(CMAKE_CROSSCOMPILING) - PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf) - ELSE() - SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} - CACHE FILEPATH "protobuf executable." FORCE) - PROMPT_PROTOBUF_LIB(extern_protobuf) - ENDIF() + SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} + CACHE FILEPATH "protobuf executable." FORCE) + PROMPT_PROTOBUF_LIB(extern_protobuf) ENDIF(NOT PROTOBUF_FOUND) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index c4472040ce..9e6c47f016 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -156,10 +156,8 @@ set(GPU_COMMON_FLAGS endif(NOT WIN32) if (APPLE) - if(NOT CMAKE_CROSSCOMPILING) - # On Mac OS X build fat binaries with x86_64 architectures by default. - set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) - endif() + # On Mac OS X build fat binaries with x86_64 architectures by default. 
+ set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0 set (COMMON_FLAGS -Wno-deprecated-register) endif(APPLE) From 3ce10dba15f7d0ac6f3a4e45e59550ac58563eff Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 21 Jan 2019 18:56:16 +0800 Subject: [PATCH 047/123] remove legacy USE_NNPACK option --- CMakeLists.txt | 6 ------ cmake/external/nnpack.cmake | 30 ------------------------------ 2 files changed, 36 deletions(-) delete mode 100644 cmake/external/nnpack.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index de62382b78..37bc1743e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,7 +60,6 @@ option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) -option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_PSLIB "Compile with pslib support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) @@ -283,11 +282,6 @@ if(WITH_MKLDNN) list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB}) endif() -if(USE_NNPACK) - include(external/nnpack) - list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS}) -endif(USE_NNPACK) - set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") diff --git a/cmake/external/nnpack.cmake b/cmake/external/nnpack.cmake deleted file mode 100644 index d42bcb0f32..0000000000 --- a/cmake/external/nnpack.cmake +++ /dev/null @@ -1,30 +0,0 @@ -# Find the NNPACK library -# NNPACK_ROOT - where to find NNPACK include and library. 
-# - -set(NNPACK_FOUND OFF) -set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK") -find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include) -find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib) -find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib) -find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib) -find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib) - -if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB) - set(NNPACK_FOUND ON) - INCLUDE_DIRECTORIES(${NNPACK_INC_DIR}) - - set(NNPACK_LIBS) - list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB}) - if (NNPACK_UKERNELS_LIB) - list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB}) - endif() - if (NNPACK_CPUFEATURES_LIB) - list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB}) - endif() - if(NOT ANDROID) - list(APPEND NNPACK_LIBS "rt") - endif() -else() - message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})") -endif() From cf29ea1592017ef037585fcc89151c159ba64317 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 21 Jan 2019 19:08:26 +0800 Subject: [PATCH 048/123] remove legacy ANDROID option --- CMakeLists.txt | 23 --------------- Dockerfile.android | 42 --------------------------- cmake/external/glog.cmake | 10 ++----- cmake/external/libxsmm.cmake | 2 +- cmake/generic.cmake | 4 +-- cmake/system.cmake | 15 ---------- paddle/fluid/pybind/CMakeLists.txt | 4 +-- paddle/scripts/README.md | 1 - paddle/scripts/paddle_docker_build.sh | 3 -- 9 files changed, 7 insertions(+), 97 deletions(-) delete mode 100644 Dockerfile.android diff --git a/CMakeLists.txt b/CMakeLists.txt index 37bc1743e2..9ec632e206 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,29 +91,6 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() -if(ANDROID OR IOS) - if(ANDROID) - if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") - message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16") - endif() - endif() - - set(WITH_GPU OFF CACHE STRING - "Disable GPU when cross-compiling for Android and iOS" FORCE) - set(WITH_AVX OFF CACHE STRING - "Disable AVX when cross-compiling for Android and iOS" FORCE) - set(WITH_PYTHON OFF CACHE STRING - "Disable PYTHON when cross-compiling for Android and iOS" FORCE) - set(WITH_RDMA OFF CACHE STRING - "Disable RDMA when cross-compiling for Android and iOS" FORCE) - set(WITH_MKL OFF CACHE STRING - "Disable MKL when cross-compiling for Android and iOS" FORCE) - set(WITH_NGRAPH OFF CACHE STRING - "Disable nGraph when cross-compiling for Android and iOS" FORCE) - set(WITH_GOLANG OFF CACHE STRING - "Disable golang when cross-compiling for Android and iOS" FORCE) -endif() - if (APPLE) set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE) diff --git a/Dockerfile.android b/Dockerfile.android deleted file mode 100644 index 48db2efea2..0000000000 --- a/Dockerfile.android +++ /dev/null @@ -1,42 +0,0 @@ -FROM ubuntu:16.04 -MAINTAINER PaddlePaddle Authors - -ARG UBUNTU_MIRROR -RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' - -# ENV variables -ARG ANDROID_ABI -ARG ANDROID_API - -ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"} -ENV ANDROID_API=${ANDROID_API:-21} - -ENV HOME=/root \ - ANDROID_NDK_HOME=/opt/android-ndk-linux \ - ANDROID_TOOLCHAINS_DIR=/opt/toolchains - -RUN apt-get update && \ - apt-get install -y \ - git python-dev python-pip python-numpy \ - wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \ 
- apt-get clean -y - -# git credential to skip password typing -RUN git config --global credential.helper store - -# Fix locales to en_US.UTF-8 -RUN localedef -i en_US -f UTF-8 en_US.UTF-8 - -RUN pip install --upgrade pip==9.0.3 && \ - pip install -U 'protobuf==3.1.0' && \ - pip install -U wheel sphinx && \ - pip install pre-commit - -# Android NDK -RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \ - mkdir -p /opt/android-ndk-tmp && \ - cd /opt/android-ndk-tmp && \ - wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \ - unzip -q android-ndk-r14b-linux-x86_64.zip && \ - mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ - rm -rf /opt/android-ndk-tmp diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 72a2f60191..7a6a452388 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -26,14 +26,8 @@ ENDIF(WIN32) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) -IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") - # Using the unofficial glog for Android API < 21 - SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git") - SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8") -ELSE() - SET(GLOG_REPOSITORY "https://github.com/google/glog.git") - SET(GLOG_TAG "v0.3.5") -ENDIF() +SET(GLOG_REPOSITORY "https://github.com/google/glog.git") +SET(GLOG_TAG "v0.3.5") ExternalProject_Add( extern_glog diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake index 530f7ebe28..05fb94a727 100644 --- a/cmake/external/libxsmm.cmake +++ b/cmake/external/libxsmm.cmake @@ -19,7 +19,7 @@ IF(NOT WITH_LIBXSMM) return() ENDIF() -IF(WIN32 OR APPLE OR ANDROID OR IOS) +IF(WIN32 OR APPLE) MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.") SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE) return() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 7dd59577c4..1f4dbe0b49 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -90,11 +90,11 @@ # including binary directory for generated headers. 
include_directories(${CMAKE_CURRENT_BINARY_DIR}) -if(NOT APPLE AND NOT ANDROID) +if(NOT APPLE) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") -endif(NOT APPLE AND NOT ANDROID) +endif(NOT APPLE) set_property(GLOBAL PROPERTY FLUID_MODULES "") # find all fluid modules is used for paddle fluid static library diff --git a/cmake/system.cmake b/cmake/system.cmake index c91ef91127..65db05bebe 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -74,21 +74,6 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES) MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}") MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") -# configuration for cross-compiling -IF(DEFINED CMAKE_SYSTEM_NAME) - INCLUDE(cross_compiling/host) - IF(${CMAKE_SYSTEM_NAME} STREQUAL "Android") - SET(ANDROID TRUE) - INCLUDE(cross_compiling/android) - ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "RPi") - SET(RPI TRUE) - INCLUDE(cross_compiling/raspberry_pi) - ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") - SET(IOS TRUE) - INCLUDE(cross_compiling/ios) - ENDIF() -ENDIF() - # external dependencies log output SET(EXTERNAL_PROJECT_LOG_ARGS LOG_DOWNLOAD 0 # Wrap download in script to log output diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 9a91ea38ca..5cc79b9c23 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -17,9 +17,9 @@ if(WITH_PYTHON) SRCS ${PYBIND_SRCS} DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) - if(NOT APPLE AND NOT ANDROID AND NOT WIN32) + if(NOT APPLE AND NOT WIN32) target_link_libraries(paddle_pybind rt) - endif(NOT APPLE AND NOT ANDROID AND NOT WIN32) + endif(NOT APPLE AND NOT WIN32) endif(WITH_AMD_GPU) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md index dd3242f62b..6c608fce3c 100644 --- a/paddle/scripts/README.md +++ b/paddle/scripts/README.md @@ -40,7 +40,6 @@ The lastest pre-built build environment images are: | Image | Tag | | ----- | --- | | paddlepaddle/paddle | latest-dev | -| paddlepaddle/paddle | latest-dev-android | ### Start Build diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh index 9a098dbbc6..91ca8907c7 100755 --- a/paddle/scripts/paddle_docker_build.sh +++ b/paddle/scripts/paddle_docker_build.sh @@ -66,9 +66,6 @@ function main() { DOCKER_REPO="paddlepaddle/paddle" VERSION="latest-dev" PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )" - if [ "$1" == "build_android" ]; then - VERSION="latest-dev-android" - fi IMG=${DOCKER_REPO}:${VERSION} start_build_docker $@ } From df92d05ef369368f473a80b32aece44073db3986 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 21 Jan 2019 19:13:41 +0800 Subject: [PATCH 049/123] remove legacy IOS option test=develop --- cmake/cblas.cmake | 7 ------- cmake/external/libxsmm.cmake | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 74b1ef2122..52ac31d1d1 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -92,10 +92,3 @@ if(WITH_SYSTEM_BLAS) message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") endif() endif() - -if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND) - set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER vecLib) - set(CBLAS_INC_DIR ${VECLIB_INC_DIR}) - add_definitions(-DPADDLE_USE_VECLIB) 
-endif() diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake index 05fb94a727..39f49d210a 100644 --- a/cmake/external/libxsmm.cmake +++ b/cmake/external/libxsmm.cmake @@ -20,7 +20,7 @@ IF(NOT WITH_LIBXSMM) ENDIF() IF(WIN32 OR APPLE) - MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.") + MESSAGE(WARNING "Windows, Mac are not supported with libxsmm in Paddle yet.") SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE) return() ENDIF() From 9449844c2a37081d41e947920d2480856b69d3f7 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 21 Jan 2019 19:31:00 +0800 Subject: [PATCH 050/123] update ctr_reader in API.spec test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 82307b4229..e73b76f8b5 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -356,7 +356,7 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_indexs', 'sparse_slot_indexs', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None) From e6a3a3a31a672994054dae4c9e23a712ad206180 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 21 Jan 2019 19:47:34 +0800 Subject: [PATCH 051/123] fix pr 15313 test=develop --- paddle/fluid/operators/group_norm_op.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 6e460c470b..3bf8586254 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -21,20 +21,20 @@ namespace operators { enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 }; -#define CHECK_CASE(i, flags, kernel_name, args...) \ - if (i == flags) { \ - kernel_name<<>>(args); \ +#define CHECK_CASE(i, flags, kernel_name, ...) 
\ + if (i == flags) { \ + kernel_name<<>>(__VA_ARGS__); \ } // 0 for no scale, no bias // 1 for has scale, no bias // 2 for no scale, has bias // 3 for has scale, has bias -#define UNROLL_ALL_CASES(flags, kernel_name, args...) \ - CHECK_CASE(0, flags, kernel_name, args) \ - CHECK_CASE(1, flags, kernel_name, args) \ - CHECK_CASE(2, flags, kernel_name, args) \ - CHECK_CASE(3, flags, kernel_name, args) +#define UNROLL_ALL_CASES(flags, kernel_name, ...) \ + CHECK_CASE(0, flags, kernel_name, __VA_ARGS__) \ + CHECK_CASE(1, flags, kernel_name, __VA_ARGS__) \ + CHECK_CASE(2, flags, kernel_name, __VA_ARGS__) \ + CHECK_CASE(3, flags, kernel_name, __VA_ARGS__) template __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { From 54c0da080dccb843e6a21f0acbcfbf5ffd3a3f86 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 21 Jan 2019 20:04:18 +0800 Subject: [PATCH 052/123] fix compiler error in paddle_build.sh test=develop --- paddle/scripts/paddle_build.sh | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index a06952782b..cda04451f5 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -709,10 +709,9 @@ function gen_fluid_lib() { Generating fluid library for train and inference ... ======================================== EOF - cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON - make -j `nproc` fluid_lib_dist - make -j `nproc` inference_lib_dist - fi + cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON + make -j `nproc` fluid_lib_dist + make -j `nproc` inference_lib_dist } function tar_fluid_lib() { @@ -721,12 +720,11 @@ function tar_fluid_lib() { Taring fluid library for train and inference ... ======================================== EOF - cd ${PADDLE_ROOT}/build - cp -r fluid_install_dir fluid - tar -czf fluid.tgz fluid - cp -r fluid_inference_install_dir fluid_inference - tar -czf fluid_inference.tgz fluid_inference - fi + cd ${PADDLE_ROOT}/build + cp -r fluid_install_dir fluid + tar -czf fluid.tgz fluid + cp -r fluid_inference_install_dir fluid_inference + tar -czf fluid_inference.tgz fluid_inference } function test_fluid_lib() { @@ -735,12 +733,11 @@ function test_fluid_lib() { Testing fluid library for inference ... 
======================================== EOF - cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci - ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} \ - ${TENSORRT_INCLUDE_DIR:-/usr/local/TensorRT/include} \ - ${TENSORRT_LIB_DIR:-/usr/local/TensorRT/lib} - ./clean.sh - fi + cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci + ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} \ + ${TENSORRT_INCLUDE_DIR:-/usr/local/TensorRT/include} \ + ${TENSORRT_LIB_DIR:-/usr/local/TensorRT/lib} + ./clean.sh } function main() { From d60751fb71490b77bbf5b90faa87fcf4bb01670b Mon Sep 17 00:00:00 2001 From: flame Date: Mon, 21 Jan 2019 20:58:17 +0800 Subject: [PATCH 053/123] add python inference api (#15248) add python inference api --- paddle/fluid/API.spec | 1 + .../fluid/inference/api/analysis_predictor.h | 2 +- paddle/fluid/pybind/CMakeLists.txt | 5 +- paddle/fluid/pybind/inference_api.cc | 256 ++++++++++++++++++ paddle/fluid/pybind/inference_api.h | 23 ++ paddle/fluid/pybind/pybind.cc | 3 +- python/paddle/fluid/compiler.py | 29 +- python/paddle/fluid/executor.py | 7 + .../paddle/fluid/tests/book/test_word2vec.py | 29 +- 9 files changed, 346 insertions(+), 9 deletions(-) create mode 100644 paddle/fluid/pybind/inference_api.cc create mode 100644 paddle/fluid/pybind/inference_api.h diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 0a4edea2c3..ad39542b4d 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -45,6 +45,7 @@ paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], vararg paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None) paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.CompiledProgram.with_inference_optimize ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index e25b5a7047..9095b6ec1a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -45,6 +45,7 @@ using contrib::AnalysisConfig; class AnalysisPredictor : public PaddlePredictor { public: explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {} + ~AnalysisPredictor(); bool Init(const std::shared_ptr &parent_scope, const std::shared_ptr &program = nullptr); @@ -95,7 +96,6 @@ class AnalysisPredictor : public PaddlePredictor { template void GetFetchOne(const framework::LoDTensor &fetchs, PaddleTensor *output_data); - ~AnalysisPredictor(); // Some more detailed tests, they are made the friends of the predictor, so that // the all the details can be tested. 
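For context, a minimal sketch of how the Python inference API introduced by this patch might be driven once the bindings below are built. This is illustrative only: the model directory, the feed name `x`, and the input shape are assumptions, not part of the patch; the classes and calls used (`AnalysisConfig`, `PaddleTensor`, `PaddleBuf`, `PaddleDType`, `create_paddle_predictor`) are the ones bound in `inference_api.cc` below.

# Hypothetical usage of the new bindings; the model path, feed name, and
# input shape are placeholders for a real saved inference model.
import numpy as np
from paddle.fluid import core

config = core.AnalysisConfig("./infer_model")  # assumed model directory
config.disable_gpu()
config.switch_ir_optim(True)

predictor = core.create_paddle_predictor(config)

data = np.random.random((1, 13)).astype("float32")  # assumed input shape
tensor = core.PaddleTensor()
tensor.name = "x"                                    # assumed feed name
tensor.shape = list(data.shape)
tensor.dtype = core.PaddleDType.FLOAT32
tensor.data = core.PaddleBuf(data.flatten().tolist())

outputs = predictor.run([tensor])  # returns a list of PaddleTensor
print(outputs[0].data.float_data())

An equivalent route added by this patch is `CompiledProgram.with_inference_optimize(config)` in `compiler.py`, whose `_compile_inference` wraps the same `core.create_paddle_predictor` call for use through an `Executor`.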
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 9a91ea38ca..67b2386813 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,10 +1,11 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool - tracer) + tracer analysis_predictor) + if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) endif() -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc) if(WITH_PYTHON) if(WITH_AMD_GPU) diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc new file mode 100644 index 0000000000..2624702666 --- /dev/null +++ b/paddle/fluid/pybind/inference_api.cc @@ -0,0 +1,256 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pybind/inference_api.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { +using paddle::PaddleDType; +using paddle::PaddleBuf; +using paddle::PaddleTensor; +using paddle::PaddlePlace; +using paddle::PaddlePredictor; +using paddle::NativeConfig; +using paddle::NativePaddlePredictor; +using paddle::AnalysisPredictor; +using paddle::contrib::AnalysisConfig; + +static void BindPaddleDType(py::module *m); +static void BindPaddleBuf(py::module *m); +static void BindPaddleTensor(py::module *m); +static void BindPaddlePlace(py::module *m); +static void BindPaddlePredictor(py::module *m); +static void BindNativeConfig(py::module *m); +static void BindNativePredictor(py::module *m); +static void BindAnalysisConfig(py::module *m); +static void BindAnalysisPredictor(py::module *m); + +void BindInferenceApi(py::module *m) { + BindPaddleDType(m); + BindPaddleBuf(m); + BindPaddleTensor(m); + BindPaddlePlace(m); + BindPaddlePredictor(m); + BindNativeConfig(m); + BindNativePredictor(m); + BindAnalysisConfig(m); + BindAnalysisPredictor(m); + + m->def("create_paddle_predictor", + &paddle::CreatePaddlePredictor); + m->def("create_paddle_predictor", + &paddle::CreatePaddlePredictor); + m->def("paddle_dtype_size", &paddle::PaddleDtypeSize); +} + +void BindPaddleDType(py::module *m) { + py::enum_(*m, "PaddleDType") + .value("FLOAT32", PaddleDType::FLOAT32) + .value("INT64", PaddleDType::INT64); +} + +void BindPaddleBuf(py::module *m) { + py::class_(*m, "PaddleBuf") + .def(py::init()) + .def(py::init([](std::vector &data) { + auto buf = PaddleBuf(data.size() * sizeof(float)); + std::memcpy(buf.data(), static_cast(data.data()), buf.length()); + return std::move(buf); + })) + .def(py::init([](std::vector &data) { + 
auto buf = PaddleBuf(data.size() * sizeof(int64_t)); + std::memcpy(buf.data(), static_cast(data.data()), buf.length()); + return std::move(buf); + })) + .def("resize", &PaddleBuf::Resize) + .def("reset", + [](PaddleBuf &self, std::vector &data) { + self.Resize(data.size() * sizeof(float)); + std::memcpy(self.data(), data.data(), self.length()); + }) + .def("reset", + [](PaddleBuf &self, std::vector &data) { + self.Resize(data.size() * sizeof(int64_t)); + std::memcpy(self.data(), data.data(), self.length()); + }) + .def("empty", &PaddleBuf::empty) + .def("float_data", + [](PaddleBuf &self) -> std::vector { + auto *data = static_cast(self.data()); + return {data, data + self.length() / sizeof(*data)}; + }) + .def("int64_data", + [](PaddleBuf &self) -> std::vector { + int64_t *data = static_cast(self.data()); + return {data, data + self.length() / sizeof(*data)}; + }) + .def("length", &PaddleBuf::length); +} + +void BindPaddleTensor(py::module *m) { + py::class_(*m, "PaddleTensor") + .def(py::init<>()) + .def_readwrite("name", &PaddleTensor::name) + .def_readwrite("shape", &PaddleTensor::shape) + .def_readwrite("data", &PaddleTensor::data) + .def_readwrite("dtype", &PaddleTensor::dtype) + .def_readwrite("lod", &PaddleTensor::lod); +} + +void BindPaddlePlace(py::module *m) { + py::enum_(*m, "PaddlePlace") + .value("UNK", PaddlePlace::kUNK) + .value("CPU", PaddlePlace::kCPU) + .value("GPU", PaddlePlace::kGPU); +} + +void BindPaddlePredictor(py::module *m) { + auto paddle_predictor = py::class_(*m, "PaddlePredictor"); + paddle_predictor + .def("run", + [](PaddlePredictor &self, const std::vector &inputs) { + std::vector outputs; + self.Run(inputs, &outputs); + return outputs; + }) + .def("get_input_tensor", &PaddlePredictor::GetInputTensor) + .def("get_output_tensor", &PaddlePredictor::GetOutputTensor) + .def("zero_copy_run", &PaddlePredictor::ZeroCopyRun) + .def("clone", &PaddlePredictor::Clone); + + auto config = py::class_(paddle_predictor, "Config"); + config.def(py::init<>()) + .def_readwrite("model_dir", &PaddlePredictor::Config::model_dir); +} + +void BindNativeConfig(py::module *m) { + py::class_(*m, "NativeConfig") + .def(py::init<>()) + .def_readwrite("use_gpu", &NativeConfig::use_gpu) + .def_readwrite("device", &NativeConfig::device) + .def_readwrite("fraction_of_gpu_memory", + &NativeConfig::fraction_of_gpu_memory) + .def_readwrite("prog_file", &NativeConfig::prog_file) + .def_readwrite("param_file", &NativeConfig::param_file) + .def_readwrite("specify_input_name", &NativeConfig::specify_input_name) + .def("set_cpu_math_library_num_threads", + &NativeConfig::SetCpuMathLibraryNumThreads) + .def("cpu_math_library_num_threads", + &NativeConfig::cpu_math_library_num_threads); +} + +void BindNativePredictor(py::module *m) { + py::class_(*m, + "NativePaddlePredictor") + .def(py::init()) + .def("init", &NativePaddlePredictor::Init) + .def("run", + [](NativePaddlePredictor &self, + const std::vector &inputs) { + std::vector outputs; + self.Run(inputs, &outputs); + return outputs; + }) + .def("get_input_tensor", &NativePaddlePredictor::GetInputTensor) + .def("get_output_tensor", &NativePaddlePredictor::GetOutputTensor) + .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun) + .def("clone", &NativePaddlePredictor::Clone) + .def("scope", &NativePaddlePredictor::scope, + py::return_value_policy::reference); +} + +void BindAnalysisConfig(py::module *m) { + py::class_(*m, "AnalysisConfig") + .def(py::init()) + .def(py::init()) + .def(py::init()) + .def("set_model", (void 
(AnalysisConfig::*)(const std::string &)) &
+                            AnalysisConfig::SetModel)
+      .def("set_model", (void (AnalysisConfig::*)(const std::string &,
+                                                  const std::string &)) &
+                            AnalysisConfig::SetModel)
+      .def("set_prog_file", &AnalysisConfig::SetProgFile)
+      .def("set_params_file", &AnalysisConfig::SetParamsFile)
+      .def("model_dir", &AnalysisConfig::model_dir)
+      .def("prog_file", &AnalysisConfig::prog_file)
+      .def("params_file", &AnalysisConfig::params_file)
+      .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu,
+           py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0)
+      .def("disable_gpu", &AnalysisConfig::DisableGpu)
+      .def("use_gpu", &AnalysisConfig::use_gpu)
+      .def("gpu_device_id", &AnalysisConfig::gpu_device_id)
+      .def("memory_pool_init_size_mb",
+           &AnalysisConfig::memory_pool_init_size_mb)
+      .def("fraction_of_gpu_memory_for_pool",
+           &AnalysisConfig::fraction_of_gpu_memory_for_pool)
+      .def("switch_ir_optim", &AnalysisConfig::SwitchIrOptim,
+           py::arg("x") = true)
+      .def("ir_optim", &AnalysisConfig::ir_optim)
+      .def("switch_use_feed_fetch_ops", &AnalysisConfig::SwitchUseFeedFetchOps,
+           py::arg("x") = true)
+      .def("use_feed_fetch_ops_enabled",
+           &AnalysisConfig::use_feed_fetch_ops_enabled)
+      .def("switch_specify_input_names",
+           &AnalysisConfig::SwitchSpecifyInputNames, py::arg("x") = true)
+      .def("specify_input_name", &AnalysisConfig::specify_input_name)
+      .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine,
+           py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
+           py::arg("min_subgraph_size") = 3)
+      .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
+      .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
+           py::arg("x") = true)
+      .def("enable_mkldnn", &AnalysisConfig::EnableMKLDNN)
+      .def("mkldnn_enabled", &AnalysisConfig::mkldnn_enabled)
+      .def("set_cpu_math_library_num_threads",
+           &AnalysisConfig::SetCpuMathLibraryNumThreads)
+      .def("cpu_math_library_num_threads",
+           &AnalysisConfig::cpu_math_library_num_threads)
+      .def("to_native_config", &AnalysisConfig::ToNativeConfig)
+      .def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp)
+      .def("set_model_buffer", &AnalysisConfig::SetModelBuffer)
+      .def("model_from_memory", &AnalysisConfig::model_from_memory)
+      .def("pass_builder", &AnalysisConfig::pass_builder,
+           py::return_value_policy::reference);
+}
+
+void BindAnalysisPredictor(py::module *m) {
+  py::class_<AnalysisPredictor, PaddlePredictor>(*m, "AnalysisPredictor")
+      .def(py::init<const AnalysisConfig &>())
+      .def("init", &AnalysisPredictor::Init)
+      .def(
+          "run",
+          [](AnalysisPredictor &self, const std::vector<PaddleTensor> &inputs) {
+            std::vector<PaddleTensor> outputs;
+            self.Run(inputs, &outputs);
+            return outputs;
+          })
+      .def("get_input_tensor", &AnalysisPredictor::GetInputTensor)
+      .def("get_output_tensor", &AnalysisPredictor::GetOutputTensor)
+      .def("zero_copy_run", &AnalysisPredictor::ZeroCopyRun)
+      .def("clone", &AnalysisPredictor::Clone)
+      .def("scope", &AnalysisPredictor::scope,
+           py::return_value_policy::reference);
+}
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/inference_api.h b/paddle/fluid/pybind/inference_api.h
new file mode 100644
index 0000000000..c2adfbecf7
--- /dev/null
+++ b/paddle/fluid/pybind/inference_api.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace paddle {
+namespace pybind {
+void BindInferenceApi(pybind11::module *m);
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 96d0d16bf7..b086c21898 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -49,6 +49,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/exception.h"
 #include "paddle/fluid/pybind/imperative.h"
+#include "paddle/fluid/pybind/inference_api.h"
 #include "paddle/fluid/pybind/ir.h"
 #include "paddle/fluid/pybind/protobuf.h"
 #include "paddle/fluid/pybind/pybind.h"  // NOLINT
@@ -1083,9 +1084,9 @@ All parameter, weight, gradient are variables in Paddle.
   BindRecordIOWriter(&m);
   BindAsyncExecutor(&m);
-
   BindGraph(&m);
   BindNode(&m);
+  BindInferenceApi(&m);
 }
 }  // namespace pybind
 }  // namespace paddle
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index 8bdd03fd50..a35a4c5983 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -24,6 +24,8 @@ __all__ = ['CompiledProgram', 'ExecutionStrategy', 'BuildStrategy']
 ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
 BuildStrategy = core.ParallelExecutor.BuildStrategy
+InferNativeConfig = core.NativeConfig
+InferAnalysisConfig = core.AnalysisConfig
 
 
 def _place_obj(place):
@@ -70,6 +72,7 @@ class CompiledProgram(object):
         self._executor = None
         self._compiled = False
         self._is_data_parallel = False
+        self._is_inference = False
 
     def with_data_parallel(self,
                            loss_name=None,
@@ -109,10 +112,24 @@ class CompiledProgram(object):
             self._build_strategy = BuildStrategy()
         return self
 
-    def _with_distributed(self):
-        raise NotImplementedError()
+    def with_inference_optimize(self, config):
+        """Add inference optimization.
+
+        Args:
+            config: instance of `NativeConfig` or `AnalysisConfig` used to
+                create the predictor
+        Returns:
+            self
+        """
+        assert any([
+            isinstance(config, InferNativeConfig),
+            isinstance(config, InferAnalysisConfig)
+        ])
+        self._is_data_parallel = False
+        self._is_inference = True
+        self._infer_config = config
+        return self
 
-    def _with_inference_optimize(self):
+    def _with_distributed(self):
         raise NotImplementedError()
 
     def _compile_data_parallel(self):
@@ -177,6 +194,10 @@ class CompiledProgram(object):
                 if self._loss_name else six.u(''), self._scope,
             self._local_scopes, self._exec_strategy, self._build_strategy)
 
+    def _compile_inference(self):
+        assert self._is_data_parallel is False
+        return core.create_paddle_predictor(self._infer_config)
+
     def _compile(self, scope, place):
         """Compile the program based on the configs.
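A minimal end-to-end sketch of the inference path added above (assuming `inference_program`, `exe`, and `infer_inputs` come from an earlier `load_inference_model`/`Executor` setup, as in the word2vec test later in this patch; the model path is a placeholder):

    import paddle.fluid as fluid

    config = fluid.core.NativeConfig()
    config.model_dir = 'my.inference.model'  # placeholder path
    compiled = fluid.compiler.CompiledProgram(inference_program)
    compiled.with_inference_optimize(config)
    # infer_inputs is a list of fluid.core.PaddleTensor; run() dispatches
    # to the C++ predictor created by _compile_inference().
    outputs = exe.run(compiled, feed=infer_inputs)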
@@ -200,6 +221,8 @@ class CompiledProgram(object): self._place = place if self._is_data_parallel: self._executor = self._compile_data_parallel() + elif self._is_inference: + self._executor = self._compile_inference() else: p = _place_obj(self._place) self._executor = core.Executor(p) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 0d06d0f2c9..20aa6054fe 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -27,6 +27,8 @@ from .. import compat as cpt __all__ = ['Executor', 'global_scope', 'scope_guard'] g_scope = core.Scope() +InferNativeConfig = core.NativeConfig +InferAnalysisConfig = core.AnalysisConfig def global_scope(): @@ -533,6 +535,8 @@ class Executor(object): fetch_list=fetch_list, fetch_var_name=fetch_var_name, return_numpy=return_numpy) + elif program._is_inference: + return self._run_inference(program, feed) else: # TODO(panyx0718): Can compile program to optimize executor # performance. @@ -590,3 +594,6 @@ class Executor(object): if return_numpy: outs = as_numpy(outs) return outs + + def _run_inference(self, program, feed): + return self.executor.run(feed) diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index e24a9aa989..48cb778927 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -195,9 +195,34 @@ def infer(use_cuda, save_dirname=None): }, fetch_list=fetch_targets, return_numpy=False) - print(results[0].recursive_sequence_lengths()) + + def to_infer_tensor(lod_tensor): + infer_tensor = fluid.core.PaddleTensor() + infer_tensor.lod = lod_tensor.lod() + infer_tensor.data = fluid.core.PaddleBuf(np.array(lod_tensor)) + infer_tensor.shape = lod_tensor.shape() + infer_tensor.dtype = fluid.core.PaddleDType.INT64 + return infer_tensor + + infer_inputs = [first_word, second_word, third_word, fourth_word] + infer_inputs = [to_infer_tensor(t) for t in infer_inputs] + + infer_config = fluid.core.NativeConfig() + infer_config.model_dir = 'word2vec.inference.model' + infer_config.use_gpu = use_cuda + if use_cuda: + infer_config.device = 0 + infer_config.fraction_of_gpu_memory = 0.15 + compiled_program = fluid.compiler.CompiledProgram(inference_program) + compiled_program.with_inference_optimize(infer_config) + assert compiled_program._is_inference is True + infer_outputs = exe.run(compiled_program, feed=infer_inputs) np_data = np.array(results[0]) - print("Inference Shape: ", np_data.shape) + infer_out = infer_outputs[0].data.float_data() + for a, b in zip(np_data[0], infer_out): + g_a = float("{:.6g}".format(a)) + g_b = float("{:.6g}".format(b)) + assert g_a == g_b def main(use_cuda, is_sparse, is_parallel): From 59e5cc51d63c71c6c9eab0e715fdbdad4e6e5314 Mon Sep 17 00:00:00 2001 From: WangZhen Date: Mon, 21 Jan 2019 21:24:05 +0800 Subject: [PATCH 054/123] Add quantization transform pass and UT. 
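In rough terms (inferred from the unit test in this patch rather than an exhaustive op list), the pass rewrites every input of a quantizable op (conv2d, depthwise_conv2d, mul) through a quantize/dequantize pair:

    # conceptual effect, with hypothetical names for illustration:
    #   conv2d(X, W)  ->  conv2d(X.quantized.dequantized,
    #                            W.quantized.dequantized)

which is why the test below asserts that rewritten input argument names end with '.quantized.dequantized'.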
--- paddle/fluid/pybind/ir.cc | 4 +- .../paddle/fluid/contrib/slim/graph/graph.py | 20 ++++-- .../contrib/slim/quantization/__init__.py | 6 +- ...tion_performer.py => quantization_pass.py} | 60 +++++++++++----- ...performer.py => test_quantization_pass.py} | 70 +++++++++++++++---- python/paddle/fluid/framework.py | 40 +++++++++++ 6 files changed, 157 insertions(+), 43 deletions(-) rename python/paddle/fluid/contrib/slim/quantization/{quantization_performer.py => quantization_pass.py} (86%) rename python/paddle/fluid/contrib/slim/unitest/{test_quantization_performer.py => test_quantization_pass.py} (61%) diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index 24059140ab..ba0d4bb435 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -148,8 +148,8 @@ void BindNode(py::module *m) { }) .def("outputs_append", [](Node &self, Node &node) { self.outputs.push_back(&node); }) - .def_readwrite("inputs", &Node::inputs) - .def_readwrite("outputs", &Node::outputs); + .def_readonly("inputs", &Node::inputs) + .def_readonly("outputs", &Node::outputs); py::enum_(node, "Type") .value("Operation", Node::Type::kOperation) diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py index 61f9f950c4..80deeee879 100644 --- a/python/paddle/fluid/contrib/slim/graph/graph.py +++ b/python/paddle/fluid/contrib/slim/graph/graph.py @@ -26,10 +26,20 @@ class PyGraph(object): PyGraph uses core.Graph as the delegation to accomplish the manipulation. """ - def __init__(self, graph): + def __init__(self, graph, for_test=False): + """ + Construct the PyGraph using core.Graph. + Args: + graph(core.Graph): C++ Graph. + for_test(bool): True for the test graph and false for the train graph. + """ assert isinstance( graph, core.Graph), 'graph must be the instance of core.Graph.' self.graph = graph + self.for_test = for_test + + def is_test(self): + return self.for_test def all_parameters(self): param_nodes = set() @@ -103,7 +113,7 @@ class PyGraph(object): remove_nodes = set(remove_nodes) core.graph_safe_remove_nodes(self.graph, remove_nodes) - def draw_graph(self, save_path, name, marked_nodes=None): + def draw(self, save_path, name, marked_nodes=None): def _convert_to_pdf(dot_file_path): pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf' exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \ @@ -126,6 +136,8 @@ class PyGraph(object): if not isinstance(marked_nodes, set): marked_nodes = set(marked_nodes) marked_nodes = marked_nodes - remove_ctr_vars + if self.graph.has('__graphviz__marked_node__'): + self.graph.erase('__graphviz__marked_node__') self.graph.set('__graphviz__marked_node__', marked_nodes) viz_dot_path = os.path.join(save_path, name) + '.dot' viz_pass = core.get_pass('graph_viz_pass') @@ -137,8 +149,8 @@ class PyGraph(object): convert_pass = core.get_pass('graph_to_program_pass') convert_pass.set_program('program', Program().desc) convert_pass.apply(self.graph) - program = Program() - program.desc = convert_pass.get_program('program') + desc = convert_pass.get_program('program') + program = Program.construct_from_desc(desc) return program diff --git a/python/paddle/fluid/contrib/slim/quantization/__init__.py b/python/paddle/fluid/contrib/slim/quantization/__init__.py index f522385417..6c26475f48 100644 --- a/python/paddle/fluid/contrib/slim/quantization/__init__.py +++ b/python/paddle/fluid/contrib/slim/quantization/__init__.py @@ -14,7 +14,7 @@ from __future__ import print_function -from . 
import quantization_performer -from .quantization_performer import * +from . import quantization_pass +from .quantization_pass import * -__all__ = quantization_performer.__all__ +__all__ = quantization_pass.__all__ diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py similarity index 86% rename from python/paddle/fluid/contrib/slim/quantization/quantization_performer.py rename to python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index ac84b763a6..3c33a513ff 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -15,22 +15,26 @@ import collections import numpy as np from .... import core +from ....framework import Program +from ....framework import Variable from ....initializer import Constant from .... import unique_name from ..graph import PyGraph -__all__ = ['QuantizationPerformer'] +__all__ = ['QuantizationTransformPass'] -class QuantizationPerformer(object): +class QuantizationTransformPass(object): def __init__(self, + scope=None, + program_exe=None, weight_bits=8, activation_bits=8, activation_quantize_type='abs_max', weight_quantize_type='abs_max', window_size=10000): """ - Convert and rewrite the IRGraph according to weight and + Convert and rewrite the PyGraph according to weight and activation quantization type. Args: weight_bits (int): quantization bit number for weights, @@ -48,15 +52,21 @@ class QuantizationPerformer(object): window_size (int): the window size for 'range_abs_max' quantization. Examples: .. code-block:: python - # the original graph will be rewrite, if you don't want to - # change it, please clone at first. - # graph = graph.clone() - from paddle.fluid.contrib.slim import * - from paddle.fluid.contrib.quantize import * - graph = IRGraph(program) - performer = QuantizationPerformer() - performer.quantize_transform(graph) + # The original graph will be rewrite. + import paddle.fluid as fluid + from paddle.fluid.contrib.slim.quantization \ + import QuantizationTransformPass + from paddle.fluid.contrib.slim.graph import PyGraph + from paddle.fluid import core + + graph = PyGraph(core.Graph(program.desc), for_test=False) + exe = fluid.Executor(fluid.CPUPlace()) + transform_pass = QuantizationTransformPass(fluid.global_scope(), + exe) + transform_pass.apply(graph) """ + self.scope = scope + self.program_exe = program_exe self.weight_bits = weight_bits self.activation_bits = activation_bits @@ -74,7 +84,7 @@ class QuantizationPerformer(object): self.weight_quantize_type = weight_quantize_type self.window_size = window_size - self.need_inited_outer = collections.OrderedDict() + self.need_initialized = collections.OrderedDict() self.quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul'] self.quantizable_grad_ops = [ '%s_grad' % (op) for op in self.quantizable_ops @@ -86,11 +96,11 @@ class QuantizationPerformer(object): self.is_test = None self.global_step = None - def quantize_transform(self, graph, is_test): - self.need_inited_outer.clear() - self.is_test = is_test + def apply(self, graph): assert isinstance(graph, PyGraph), 'graph must be the instance of PyGraph.' + self.need_initialized.clear() + self.is_test = graph.is_test() # marked the variable which has been dequantized. 
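        # (dequantized_vars below caches each variable's dequantized output
        # node, so an input feeding several quantizable ops is rewritten
        # only once.)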
dequantized_vars = collections.OrderedDict() params = [p.name() for p in graph.all_parameters()] @@ -138,7 +148,19 @@ class QuantizationPerformer(object): if op.name() in self.quantizable_grad_ops: _transform_backward(graph, op) - return self.need_inited_outer + if len(self.need_initialized) > 0: + assert self.scope is not None, \ + 'The scope cannot be set None when activation_quantize_type equals to range_abs_max.' + assert self.program_exe is not None, \ + 'The program_exe cannot be set None when activation_quantize_type equals to range_abs_max.' + init_program = Program() + for var_desc, initializer in self.need_initialized.iteritems(): + var = Variable.construct_from_desc(init_program.global_block(), + var_desc) + initializer(var, init_program.global_block()) + self.program_exe.run(program=init_program, scope=self.scope) + + return graph def _create_global_step(self, graph): if self.weight_quantize_type == 'range_abs_max' or \ @@ -153,7 +175,7 @@ class QuantizationPerformer(object): var_type=core.VarDesc.VarType.LOD_TENSOR, shape=[1], var_dtype=core.VarDesc.VarType.INT64) - self.need_inited_outer[global_step_in.var()] = \ + self.need_initialized[global_step_in.var()] = \ Constant(value=0, force_cpu=True) global_step_out = graph.create_var_node_from_desc( global_step_in.var()) @@ -220,7 +242,7 @@ class QuantizationPerformer(object): var_type=core.VarDesc.VarType.LOD_TENSOR, shape=[1], var_dtype=var_node.var().dtype()) - self.need_inited_outer[scale_in_node.var()] = Constant(value=0.001) + self.need_initialized[scale_in_node.var()] = Constant(value=0.001) scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) inputs = {'X': var_node, 'InScale': scale_in_node} @@ -233,7 +255,7 @@ class QuantizationPerformer(object): var_type=core.VarDesc.VarType.LOD_TENSOR, shape=[self.window_size], var_dtype=var_node.var().dtype()) - self.need_inited_outer[scales_node.var()] = Constant(value=0) + self.need_initialized[scales_node.var()] = Constant(value=0) inputs['Iter'] = self.global_step outputs['OutScales'] = scales_node attrs = { diff --git a/python/paddle/fluid/contrib/slim/unitest/test_quantization_performer.py b/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py similarity index 61% rename from python/paddle/fluid/contrib/slim/unitest/test_quantization_performer.py rename to python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py index 771d880a28..31188bedbb 100644 --- a/python/paddle/fluid/contrib/slim/unitest/test_quantization_performer.py +++ b/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py @@ -15,11 +15,10 @@ import unittest import random import numpy as np -import paddle import paddle.fluid as fluid import six from paddle.fluid.framework import Program -from paddle.fluid.contrib.slim.quantization import QuantizationPerformer +from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass from paddle.fluid.contrib.slim.graph import PyGraph from paddle.fluid import core @@ -66,22 +65,39 @@ def residual_block(num): return loss -class TestQuantizationPerformer(unittest.TestCase): +class TestQuantizationTransformPass(unittest.TestCase): def setUp(self): - # since quant_op and dequant_op is not ready, use cos and sin for test - self.weight_quant_op_type = 'fake_quantize_abs_max' - self.dequant_op_type = 'fake_dequantize_max_abs' self.quantizable_op_and_inputs = { 'conv2d': ['Input', 'Filter'], 'depthwise_conv2d': ['Input', 'Filter'], 'mul': ['X', 'Y'] } - self.quantizable_op_grad_and_inputs = { + 
self.quantizable_grad_op_inputs = { 'conv2d_grad': ['Input', 'Filter'], 'depthwise_conv2d_grad': ['Input', 'Filter'], 'mul_grad': ['X', 'Y'] } + def check_program(self, transform_pass, program): + quantized_ops = set() + for block in program.blocks: + for op in block.ops: + # check forward + if op.type in self.quantizable_op_and_inputs: + for arg_name in op.input_arg_names: + self.assertTrue( + arg_name.endswith('.quantized.dequantized')) + quantized_ops.add(arg_name) + + for op in block.ops: + # check backward + if op.type in self.quantizable_grad_op_inputs: + for pname in self.quantizable_grad_op_inputs[op.type]: + arg_name = op.input(pname)[0] + self.assertTrue( + arg_name.endswith('.quantized.dequantized')) + self.assertTrue(arg_name in quantized_ops) + def linear_fc_quant(self, quant_type): main = fluid.Program() startup = fluid.Program() @@ -89,14 +105,26 @@ class TestQuantizationPerformer(unittest.TestCase): loss = linear_fc(3) opt = fluid.optimizer.Adam(learning_rate=0.001) opt.minimize(loss) - graph = PyGraph(core.Graph(main.desc)) - performer = QuantizationPerformer(activation_quantize_type=quant_type) - performer.quantize_transform(graph, False) + exe = fluid.Executor(fluid.CPUPlace()) + graph = PyGraph(core.Graph(main.desc), for_test=False) + transform_pass = QuantizationTransformPass( + scope=fluid.global_scope(), + program_exe=exe, + activation_quantize_type=quant_type) + transform_pass.apply(graph) marked_nodes = set() for op in graph.all_ops(): if op.name().find('quantize') > -1: marked_nodes.add(op) - graph.draw_graph('.', 'quantize_fc_' + quant_type, marked_nodes) + graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes) + program = graph.to_program() + self.check_program(transform_pass, program) + val_graph = PyGraph(core.Graph(program.desc), for_test=False) + val_marked_nodes = set() + for op in val_graph.all_ops(): + if op.name().find('quantize') > -1: + val_marked_nodes.add(op) + val_graph.draw('.', 'val_fc_' + quant_type, val_marked_nodes) def test_linear_fc_quant_abs_max(self): self.act_quant_op_type = 'fake_quantize_abs_max' @@ -113,14 +141,26 @@ class TestQuantizationPerformer(unittest.TestCase): loss = residual_block(2) opt = fluid.optimizer.Adam(learning_rate=0.001) opt.minimize(loss) - graph = PyGraph(core.Graph(main.desc)) - performer = QuantizationPerformer(activation_quantize_type=quant_type) - performer.quantize_transform(graph, False) + exe = fluid.Executor(fluid.CPUPlace()) + graph = PyGraph(core.Graph(main.desc), for_test=False) + transform_pass = QuantizationTransformPass( + scope=fluid.global_scope(), + program_exe=exe, + activation_quantize_type=quant_type) + transform_pass.apply(graph) marked_nodes = set() for op in graph.all_ops(): if op.name().find('quantize') > -1: marked_nodes.add(op) - graph.draw_graph('.', 'quantize_residual_' + quant_type, marked_nodes) + graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes) + program = graph.to_program() + self.check_program(transform_pass, program) + val_graph = PyGraph(core.Graph(program.desc), for_test=False) + val_marked_nodes = set() + for op in val_graph.all_ops(): + if op.name().find('quantize') > -1: + val_marked_nodes.add(op) + val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes) def test_residual_block_abs_max(self): self.act_quant_op_type = 'fake_quantize_abs_max' diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 8d061f41f0..3fd625109a 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -378,6 
+378,27 @@ class Variable(object): self._ivar.desc = self.desc self._ivar.stop_gradient = stop_gradient + @staticmethod + def construct_from_desc(block, desc): + """ + Construct a Variable from variable desc. + Args: + desc(core.VarDesc): The variable desc for constructing. + + Returns: + Variable: A variable. + """ + v = Variable( + block=block, + type=desc.type(), + name=desc.name(), + shape=desc.shape(), + dtype=desc.dtype(), + lod_level=desc.lod_level(), + persistable=desc.persistable()) + v.desc = desc + return v + def _numpy(self): tensor = self._ivar.value().get_tensor() return np.array(tensor) @@ -1925,6 +1946,25 @@ class Program(object): p._sync_with_cpp() return p + @staticmethod + def construct_from_desc(desc): + """ + Construct a program from program desc. + + Notes: All information about parameters will be lost. + + Args: + desc(core.ProgramDesc): The program desc for constructing. + + Returns: + Program: A program. + """ + p = Program() + p.desc = desc + p.blocks = [Block(p, i) for i in six.moves.range(p.desc.num_blocks())] + p._sync_with_cpp() + return p + @property def random_seed(self): """ From 787c5e714c84d2d2d699fb58e1d420b0fe4d09d6 Mon Sep 17 00:00:00 2001 From: WangZhen Date: Mon, 21 Jan 2019 21:33:56 +0800 Subject: [PATCH 055/123] Update the API.spec. test=develop. --- paddle/fluid/API.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 50ffef72ba..3a0c8f888e 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -1,6 +1,7 @@ paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.Program.construct_from_desc ArgSpec(args=['desc'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) From 1888337a49584f356d47561389bde180a07e9586 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 21 Jan 2019 20:51:18 +0800 Subject: [PATCH 056/123] tweak the executor implementation to better match origin behavior test=develop --- python/paddle/fluid/executor.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 20aa6054fe..f6bee559ea 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -305,7 +305,9 @@ class Executor(object): def __init__(self, place): self.place = place self.program_caches = dict() - self.executor = None + p = core.Place() + p.set_place(self.place) + self._default_executor = core.Executor(p) self._closed = False def _get_program_cache(self, program_cache_key): @@ -397,12 +399,13 @@ class Executor(object): >>> ... 
>>> exe.close() """ - if not self._closed and self.executor: - self.executor.close() + if not self._closed: + self._default_executor.close() self._closed = True def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, return_numpy): + exe = program._executor if isinstance(feed, dict): feed_tensor_dict = dict() for feed_name in feed: @@ -414,8 +417,7 @@ class Executor(object): feed_tensor.set(feed[feed_name], core.CPUPlace()) feed_tensor_dict[feed_name] = feed_tensor - self.executor.feed_and_split_tensor_into_local_scopes( - feed_tensor_dict) + exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict) elif isinstance(feed, list) or isinstance(feed, tuple): if len(feed) != len(program._places): raise ValueError( @@ -436,10 +438,10 @@ class Executor(object): tensor = tmp res_dict[feed_name] = tensor res.append(res_dict) - self.executor.feed_tensors_into_local_scopes(res) + exe.feed_tensors_into_local_scopes(res) fetch_var_names = list(map(_to_name_str, fetch_list)) - self.executor.run(fetch_var_names, fetch_var_name) + exe.run(fetch_var_names, fetch_var_name) arr = scope.find_var(fetch_var_name).get_lod_tensor_array() if return_numpy: @@ -511,12 +513,9 @@ class Executor(object): compiled = isinstance(program, compiler.CompiledProgram) # For backward compatibility, run directly. if not compiled: - if not self.executor: - p = core.Place() - p.set_place(self.place) - self.executor = core.Executor(p) return self._run( program, + self._default_executor, feed=feed, fetch_list=fetch_list, feed_var_name=feed_var_name, @@ -526,7 +525,6 @@ class Executor(object): use_program_cache=use_program_cache) program._compile(scope, self.place) - self.executor = program._executor if program._is_data_parallel: return self._run_parallel( program, @@ -542,6 +540,7 @@ class Executor(object): # performance. return self._run( program._program, + self._default_executor, feed=feed, fetch_list=fetch_list, feed_var_name=feed_var_name, @@ -550,8 +549,8 @@ class Executor(object): return_numpy=return_numpy, use_program_cache=use_program_cache) - def _run(self, program, feed, fetch_list, feed_var_name, fetch_var_name, - scope, return_numpy, use_program_cache): + def _run(self, program, exe, feed, fetch_list, feed_var_name, + fetch_var_name, scope, return_numpy, use_program_cache): if feed is None: feed = {} @@ -589,7 +588,7 @@ class Executor(object): fetch_var_name=fetch_var_name) self._feed_data(program, feed, feed_var_name, scope) - self.executor.run(program.desc, scope, 0, True, True) + exe.run(program.desc, scope, 0, True, True) outs = self._fetch_data(fetch_list, fetch_var_name, scope) if return_numpy: outs = as_numpy(outs) From 1d31a0e10c1ba59e17857b7360cc71bdcd9842d1 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 21 Jan 2019 22:07:48 +0800 Subject: [PATCH 057/123] resolve conflicts test=develop --- python/paddle/fluid/executor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index f6bee559ea..d3ff14a179 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -534,7 +534,7 @@ class Executor(object): fetch_var_name=fetch_var_name, return_numpy=return_numpy) elif program._is_inference: - return self._run_inference(program, feed) + return self._run_inference(program._executor, feed) else: # TODO(panyx0718): Can compile program to optimize executor # performance. 
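Taken together with the previous patch, the dispatch inside Executor.run() now looks roughly like this (a simplified sketch of the control flow above, not the literal body):

    if not isinstance(program, compiler.CompiledProgram):
        return self._run(program, self._default_executor, ...)
    program._compile(scope, self.place)
    if program._is_data_parallel:
        return self._run_parallel(program, scope, feed, ...)
    elif program._is_inference:
        return self._run_inference(program._executor, feed)
    else:
        return self._run(program._program, self._default_executor, ...)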
@@ -594,5 +594,5 @@ class Executor(object): outs = as_numpy(outs) return outs - def _run_inference(self, program, feed): - return self.executor.run(feed) + def _run_inference(self, exe, feed): + return exe.run(feed) From 7f8b40f68ddc23c838c4d1faaf0cfa8721c78753 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 22 Jan 2019 10:04:03 +0800 Subject: [PATCH 058/123] Fix brpc complation error. (#15451) --- paddle/fluid/operators/distributed/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 6a61a8d786..cb492f9995 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -37,7 +37,7 @@ else() variable_response.cc collective_client.cc collective_server.cc ${BRPC_SRCS} - PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto + PROTO send_recv.proto DEPS lod_tensor selected_rows memory) set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib) From a1326cf363599f41ed4ecdf5b69b8815a9e54f2e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 22 Jan 2019 10:25:50 +0800 Subject: [PATCH 059/123] add NumpyArrayInitializer and use it to refactor nce op --- python/paddle/fluid/initializer.py | 61 ++++++++++++++++++- python/paddle/fluid/layers/nn.py | 27 ++++---- python/paddle/fluid/layers/tensor.py | 45 ++++---------- .../fluid/tests/unittests/test_layers.py | 12 ---- 4 files changed, 87 insertions(+), 58 deletions(-) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 8a2cd4a929..5e99007031 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -24,7 +24,8 @@ __all__ = [ 'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear', 'MSRA', 'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer', 'UniformInitializer', 'NormalInitializer', 'TruncatedNormalInitializer', - 'XavierInitializer', 'BilinearInitializer', 'MSRAInitializer' + 'XavierInitializer', 'BilinearInitializer', 'MSRAInitializer', + 'NumpyArrayInitializer' ] _force_init_on_cpu_ = False @@ -683,6 +684,64 @@ class BilinearInitializer(Initializer): return op +class NumpyArrayInitializer(Initializer): + """Init an parameter with an numpy array + + Args: + value (numpy): numpy array to initialize the variable + + Examples: + .. 
code-block:: python + + fc = fluid.layers.fc(input=x, size=10, + param_attr=fluid.initializer.NumpyArrayInitializer(numpy.array([1,2]))) + """ + + def __init__(self, value): + import numpy + assert isinstance(value, numpy.ndarray) + super(NumpyArrayInitializer, self).__init__() + self._value = value + + def __call__(self, var, block): + """Add constant initialization ops for a variable + + Args: + var: Variable that needs to be initialized + block: The block in which initialization ops + should be added + + Returns: + the initialization op + """ + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + # Initialization Ops should be prepended and not appended + dtype = framework.convert_np_dtype_to_dtype_(self._value.dtype) + if dtype == VarDesc.VarType.FP32: + value_name = "fp32_values" + values = [float(v) for v in self._value.flat] + elif dtype == VarDesc.VarType.INT32: + value_name = "int32_values" + values = [int(v) for v in self._value.flat] + else: + raise ValueError("Unsupported dtype %s", self._value.dtype) + if self._value.size > 1024 * 1024 * 5: + raise ValueError("The size of input is too big. Please consider " + "saving it to file and 'load_op' to load it") + op = block._prepend_op( + type='assign_value', + outputs={'Out': var}, + attrs={ + 'dtype': dtype, + 'shape': list(input.shape), + value_name: values + }, + stop_gradient=True) + var.op = op + return op + + # We short the class name, since users will use the initializer with the package # name. The sample code: # diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index deadb16221..709d2c07c6 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -22,7 +22,7 @@ import six import os import inspect from ..layer_helper import LayerHelper -from ..initializer import Normal, Constant +from ..initializer import Normal, Constant, NumpyArrayInitializer from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ @@ -5181,16 +5181,21 @@ def nce(input, alias_probs_[little[0]] = 1.0 alias_[little[0]] = -1 - probs = assign( - input=np.array(custom_dist).astype('float32'), init_once=True) - custom_alias = assign( - input=np.array(alias_).astype('int32'), init_once=True) - custom_alias_probs = assign( - input=np.array(alias_probs_).astype('float32'), init_once=True) - - inputs['CustomDistProbs'] = probs - inputs['CustomDistAlias'] = custom_alias - inputs['CustomDistAliasProbs'] = custom_alias_probs + def _init_by_numpy_array(numpy_array): + ret = helper.create_parameter( + attr=ParamAttr(), + shape=numpy_array.shape, + dtype=numpy_array.dtype, + default_initializer=NumpyArrayInitializer(numpy_array)) + ret.stop_gradient = True + return ret + + inputs['CustomDistProbs'] = _init_by_numpy_array( + np.array(custom_dist).astype('float32')) + inputs['CustomDistAlias'] = _init_by_numpy_array( + np.array(alias_).astype('int32')) + inputs['CustomDistAliasProbs'] = _init_by_numpy_array( + np.array(alias_probs_).astype('float32')) sampler = 2 else: raise Exception("Unsupported sampler type.") diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index bd2a729469..ce9f508c9f 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -291,7 +291,7 @@ def sums(input, out=None): return out -def assign(input, output=None, init_once=False): +def assign(input, output=None): """ **Assign** 
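The NumpyArrayInitializer introduced earlier in this patch replaces the assign(init_once=True) idiom removed below; a minimal usage sketch (shape and values are illustrative only):

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.initializer import NumpyArrayInitializer

    # create a frozen parameter initialized from a numpy array
    w = fluid.layers.create_parameter(
        shape=[2],
        dtype='float32',
        default_initializer=NumpyArrayInitializer(np.array([1.0, 2.0])))
    w.stop_gradient = True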
@@ -300,7 +300,6 @@ def assign(input, output=None, init_once=False): Args: input(Variable|numpy.ndarray): The source variable output(Variable|None): The destination variable - init_once(bool|false): assign value into global var only in startup program. Returns: Variable: The destination variable that was supplied as the *output*. @@ -314,22 +313,10 @@ def assign(input, output=None, init_once=False): """ helper = LayerHelper('assign', **locals()) if output is None: - if init_once: - output = helper.create_parameter( - attr=ParamAttr(), - shape=input.shape, - dtype=input.dtype, - default_initializer=Constant(0.0)) - output.stop_gradient = True - else: - output = helper.create_variable_for_type_inference( - dtype=input.dtype) + output = helper.create_variable_for_type_inference(dtype=input.dtype) if isinstance(input, Variable): - if init_once: - raise ValueError("init once only support numpy assign!") helper.append_op( type='assign', inputs={'X': [input]}, outputs={'Out': [output]}) - elif isinstance(input, numpy.ndarray): dtype = convert_np_dtype_to_dtype_(input.dtype) if dtype == VarDesc.VarType.FP32: @@ -340,28 +327,18 @@ def assign(input, output=None, init_once=False): values = [int(v) for v in input.flat] else: raise ValueError("Unsupported dtype %s", input.dtype) - if input.size > 1024 * 1024 * 5: + if input.size > 1024 * 1024: raise ValueError("The size of input is too big. Please consider " "saving it to file and 'load_op' to load it") - if init_once: - helper.startup_program.global_block().append_op( - type='assign_value', - outputs={'Out': [output]}, - attrs={ - 'dtype': dtype, - 'shape': list(input.shape), - value_name: values - }) - else: - helper.append_op( - type='assign_value', - outputs={'Out': [output]}, - attrs={ - 'dtype': dtype, - 'shape': list(input.shape), - value_name: values - }) + helper.append_op( + type='assign_value', + outputs={'Out': [output]}, + attrs={ + 'dtype': dtype, + 'shape': list(input.shape), + value_name: values + }) else: raise ValueError("Wrong type for assign input: %s" % type(input)) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 2e2f9a5583..90f5d797a6 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -1023,18 +1023,6 @@ class TestBook(unittest.TestCase): print(str(program)) - def test_assign(self): - import numpy as np - startup = Program() - main = Program() - with program_guard(main, startup): - probs = layers.assign( - input=np.random.random([1, 2]).astype('float32'), - init_once=True) - - print(str(main)) - print(str(startup)) - if __name__ == '__main__': unittest.main() From 0aecf7c70e52e99bf7decda820f18039b3f373e6 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 22 Jan 2019 10:46:48 +0800 Subject: [PATCH 060/123] add TestNumpyArrayInitializer --- .../fluid/tests/unittests/test_initializer.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index ab7183f88d..2e70175d43 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -420,5 +420,25 @@ class TestMSRAInitializer(unittest.TestCase): self.assertEqual(init_op.type, 'assign_value') +class TestNumpyArrayInitializer(unittest.TestCase): + def test_numpy_array_initializer(self): + """Test the numpy array initializer with supplied 
arguments + """ + import numpy + program = framework.Program() + block = program.global_block() + for _ in range(2): + np_array = numpy.array([1, 2, 3, 4]).astype('float32') + block.create_parameter( + dtype=np_array.dtype, + shape=np_array.shape, + lod_level=0, + name="param", + initializer=initializer.NumpyArrayInitializer(np_array)) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'assign_value') + + if __name__ == '__main__': unittest.main() From ec213730bcb3ca627c59c1a45b82afa4a79aed45 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Tue, 22 Jan 2019 05:00:26 +0000 Subject: [PATCH 061/123] fix trt stream bug. BUG: After continuing to input different data, the output cannot be aligned test=develop --- .../tensorrt/convert/test_op_converter.cc | 4 +- .../inference/tensorrt/convert/ut_helper.h | 10 ++-- paddle/fluid/inference/tensorrt/engine.cc | 16 +++--- paddle/fluid/inference/tensorrt/engine.h | 50 +++---------------- .../fluid/inference/tensorrt/test_engine.cc | 4 +- .../operators/tensorrt/tensorrt_engine_op.h | 9 +++- .../tensorrt/tensorrt_engine_op_test.cc | 4 +- 7 files changed, 31 insertions(+), 66 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 01d7f700da..c5a413221e 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -29,9 +29,9 @@ TEST(OpConverter, ConvertBlock) { // init trt engine cudaStream_t stream_; std::unique_ptr engine_; - engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_)); - engine_->InitNetwork(); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + engine_.reset(new TensorRTEngine(5, 1 << 15, stream_)); + engine_->InitNetwork(); engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3(2, 5, 5)); diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index f313beb73b..e83961f3d7 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -78,11 +78,9 @@ class TRTConvertValidation { scope_(scope), if_add_batch_(if_add_batch), max_batch_size_(max_batch_size) { - // create engine. - engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_)); - engine_->InitNetwork(); - PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_)); + engine_->InitNetwork(); } // Declare a Variable as input with random initialization. @@ -175,7 +173,7 @@ class TRTConvertValidation { op_->Run(scope_, place); // Execute TRT. 
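    // With this patch the engine borrows the fixture-created cudaStream_t
    // instead of owning its own, so the synchronize calls below use
    // engine_->stream() as a value rather than dereferencing a pointer.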
engine_->Execute(batch_size); - cudaStreamSynchronize(*engine_->stream()); + cudaStreamSynchronize(engine_->stream()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); const size_t output_space_size = 3000; @@ -184,7 +182,7 @@ class TRTConvertValidation { std::vector fluid_out; std::vector trt_out(output_space_size); engine_->GetOutputInCPU(output, &trt_out[0], output_space_size); - cudaStreamSynchronize(*engine_->stream()); + cudaStreamSynchronize(engine_->stream()); auto* var = scope_.FindVar(output); auto tensor = var->GetMutable(); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index f739752cbc..78b590f15d 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -42,14 +42,13 @@ void TensorRTEngine::Execute(int batch_size) { PADDLE_ENFORCE(buf.device == DeviceType::GPU); buffers.push_back(buf.buffer); } - PADDLE_ENFORCE_NOT_NULL(stream_); - infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr); - cudaStreamSynchronize(*stream_); + infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr); + cudaStreamSynchronize(stream_); SetRuntimeBatch(batch_size); } TensorRTEngine::~TensorRTEngine() { - cudaStreamSynchronize(*stream_); + cudaStreamSynchronize(stream_); // clean buffer for (auto &buf : buffers_) { if (buf.device == DeviceType::GPU && buf.buffer != nullptr) { @@ -173,7 +172,7 @@ void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size, - cudaMemcpyDeviceToDevice, *stream_), + cudaMemcpyDeviceToDevice, stream_), 0); } @@ -194,7 +193,7 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size, - cudaMemcpyDeviceToHost, *stream_)); + cudaMemcpyDeviceToHost, stream_)); } Buffer &TensorRTEngine::buffer(const std::string &name) { @@ -211,12 +210,11 @@ void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data, auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer); PADDLE_ENFORCE_NOT_NULL(data); - PADDLE_ENFORCE_NOT_NULL(stream_); PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); buf.size = size; PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyHostToDevice, *stream_)); + cudaMemcpyHostToDevice, stream_)); } void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, @@ -227,7 +225,7 @@ void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyDeviceToDevice, *stream_)); + cudaMemcpyDeviceToDevice, stream_)); } void TensorRTEngine::SetITensor(const std::string &name, diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index f5b2c28ba9..65ab7f3caa 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -54,17 +54,14 @@ class TensorRTEngine : public EngineBase { nvinfer1::Weights w_; }; - TensorRTEngine(int max_batch, int max_workspace, - cudaStream_t* stream = nullptr, int device = 0, + 
TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream, + int device = 0, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), - stream_(stream ? stream : &default_stream_), + stream_(stream), logger_(logger), - device_(device) { - freshDeviceId(); - cudaStreamCreate(stream_); - } + device_(device) {} virtual ~TensorRTEngine(); @@ -102,7 +99,7 @@ class TensorRTEngine : public EngineBase { // NOTE this should be used after calling `FreezeNetwork`. Buffer& buffer(const std::string& name) override; - cudaStream_t* stream() { return stream_; } + cudaStream_t stream() { return stream_; } // Fill an input from CPU memory with name and size. void SetInputFromCPU(const std::string& name, const void* data, size_t size); @@ -158,9 +155,8 @@ class TensorRTEngine : public EngineBase { // batch size of the current data, will be updated each Executation. int batch_size_{-1}; - cudaStream_t* stream_; - // If stream_ is not set from outside, hold its own stream. - cudaStream_t default_stream_; + cudaStream_t stream_; + nvinfer1::ILogger& logger_; std::vector buffers_; @@ -208,38 +204,6 @@ class TensorRTEngine : public EngineBase { #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \ engine__->network()->add##layer__(ARGS); -/* - * Helper to control the TensorRT engine's creation and deletion. - */ -class TRT_EngineManager { - public: - bool HasEngine(const std::string& name) const { - return engines_.count(name) != 0; - } - - // Get an engine called `name`. - TensorRTEngine* Get(const std::string& name) const { - return engines_.at(name).get(); - } - - // Create or get an engine called `name` - TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream, - const std::string& name, int gpu_device = 0) { - auto* p = new TensorRTEngine(max_batch, max_workspace, stream, gpu_device); - engines_[name].reset(p); - return p; - } - - void DeleteALl() { - for (auto& item : engines_) { - item.second.reset(nullptr); - } - } - - private: - std::unordered_map> engines_; -}; - } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index da1f6535cb..9eed0f6ee9 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -27,8 +27,8 @@ namespace tensorrt { class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { - // ASSERT_EQ(0, cudaStreamCreate(&stream_)); - engine_ = new TensorRTEngine(10, 1 << 10, &stream_); + ASSERT_EQ(0, cudaStreamCreate(&stream_)); + engine_ = new TensorRTEngine(10, 1 << 10, stream_); engine_->InitNetwork(); } diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 88c4f50847..e7e990f759 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -96,9 +96,13 @@ class TensorRTEngineOp : public framework::OperatorBase { void RunTrt(const framework::Scope &scope, const platform::Place &dev_place) const { int runtime_batch = 1; + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + auto stream = + reinterpret_cast(dev_ctx).stream(); if (trt_engine_.get() == nullptr) { trt_engine_.reset(new TensorRTEngine( - max_batch_size_, workspace_size_, nullptr, + max_batch_size_, workspace_size_, stream, 
boost::get(dev_place).device)); Prepare(scope, dev_place, trt_engine_.get()); } @@ -126,6 +130,7 @@ class TensorRTEngineOp : public framework::OperatorBase { } } + cudaStreamSynchronize(stream); PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); // Execute the engine. engine->Execute(runtime_batch); @@ -163,7 +168,7 @@ class TensorRTEngineOp : public framework::OperatorBase { output_index += 1; } - cudaStreamSynchronize(*engine->stream()); + cudaStreamSynchronize(stream); } void Prepare(const framework::Scope &scope, const platform::Place &dev_place, diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 287b0edc96..bb25a37584 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -99,7 +99,7 @@ TEST(TensorRTEngineOp, manual) { SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); SetAttr(engine_op_desc.Proto(), "max_batch_size", 2); - SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); + SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 20); SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr>(engine_op_desc.Proto(), "parameters", std::vector({})); @@ -193,7 +193,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); SetAttr(engine_op_desc.Proto(), "max_batch_size", batch_size); - SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); + SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 20); SetAttr>( engine_op_desc.Proto(), "parameters", std::vector({"y0", "y1", "y2", "y3"})); From 99d533d026188925186d1ab188130f73897dca70 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 22 Jan 2019 13:36:30 +0800 Subject: [PATCH 062/123] update TestNumpyArrayInitializer test=develop --- python/paddle/fluid/initializer.py | 2 +- python/paddle/fluid/tests/unittests/test_initializer.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 5e99007031..4f434328e4 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -734,7 +734,7 @@ class NumpyArrayInitializer(Initializer): outputs={'Out': var}, attrs={ 'dtype': dtype, - 'shape': list(input.shape), + 'shape': list(self._value.shape), value_name: values }, stop_gradient=True) diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 2e70175d43..2d98b063d1 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -427,8 +427,8 @@ class TestNumpyArrayInitializer(unittest.TestCase): import numpy program = framework.Program() block = program.global_block() + np_array = numpy.random.random((10000)).astype("float32") for _ in range(2): - np_array = numpy.array([1, 2, 3, 4]).astype('float32') block.create_parameter( dtype=np_array.dtype, shape=np_array.shape, @@ -438,6 +438,7 @@ class TestNumpyArrayInitializer(unittest.TestCase): self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'assign_value') + assert (init_op.attr('fp32_values') == np_array).all() if __name__ == '__main__': From a71f7ed787766cc2bce9d27ea471acf4f64ab93e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 22 Jan 2019 14:09:06 +0800 Subject: 
[PATCH 063/123] update API.spec test=develop --- paddle/fluid/API.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index ad39542b4d..2e7e200484 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -67,6 +67,7 @@ paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], var paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)) paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)) paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')) paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)) From d8568acd194779a98047ea05e6a0a4e1d6aee243 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 22 Jan 2019 03:44:44 +0000 Subject: [PATCH 064/123] turn on remove_unnecessary_lock test=develop --- paddle/fluid/framework/details/build_strategy.h | 2 +- paddle/fluid/pybind/pybind.cc | 2 +- python/paddle/fluid/tests/unittests/test_dist_base.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 603df2e069..cd24a31759 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -91,7 +91,7 @@ struct BuildStrategy { int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; - bool remove_unnecessary_lock_{false}; + bool remove_unnecessary_lock_{true}; // NOTE: // Before you add new options, think if it's a general strategy that works diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b086c21898..3ea3826677 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -993,7 +993,7 @@ All parameter, weight, gradient are variables in Paddle. PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.remove_unnecessary_lock_ = b; }, - R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC") + R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. 
Default True.)DOC") .def_property( "num_trainers", [](const BuildStrategy &self) { return self.num_trainers_; }, diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 3fcdc57906..03be3fe84b 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -122,7 +122,7 @@ class TestDistRunnerBase(object): if args.batch_merge_repeat > 1: pass_builder = build_stra._finalize_strategy_and_create_passes() mypass = pass_builder.insert_pass( - len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass") + len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass") mypass.set_int("num_repeats", args.batch_merge_repeat) if args.update_method == "nccl2": From b913463e83faf48a95a6db6f51357bb1af2066d4 Mon Sep 17 00:00:00 2001 From: WangZhen Date: Tue, 22 Jan 2019 14:52:12 +0800 Subject: [PATCH 065/123] Update according to the reviewers' suggestion. test=develop --- paddle/fluid/pybind/ir.cc | 4 +- paddle/fluid/pybind/pybind.cc | 8 +- .../paddle/fluid/contrib/slim/graph/graph.py | 135 +------------- .../slim/quantization/quantization_pass.py | 168 +++++++---------- .../slim/unitest/test_quantization_pass.py | 10 +- python/paddle/fluid/framework.py | 174 +++++++++++++++--- 6 files changed, 228 insertions(+), 271 deletions(-) diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index ba0d4bb435..24059140ab 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -148,8 +148,8 @@ void BindNode(py::module *m) { }) .def("outputs_append", [](Node &self, Node &node) { self.outputs.push_back(&node); }) - .def_readonly("inputs", &Node::inputs) - .def_readonly("outputs", &Node::outputs); + .def_readwrite("inputs", &Node::inputs) + .def_readwrite("outputs", &Node::outputs); py::enum_(node, "Type") .value("Operation", Node::Type::kOperation) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c55e8b0475..c470483756 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -797,18 +797,18 @@ All parameter, weight, gradient are variables in Paddle. py::class_> pass(m, "Pass"); pass.def(py::init()) .def("has", &ir::Pass::Has) - .def("set_program", + .def("set", [](ir::Pass &self, const std::string &attr_name, const ProgramDesc &attr) { return self.Set(attr_name, new ProgramDesc(attr)); }) .def( - "set_str", + "set", [](ir::Pass &self, const std::string &name, const std::string &attr) { self.Set(name, new std::string(attr)); }) - .def("set_int", [](ir::Pass &self, const std::string &name, - int val) { self.Set(name, new int(val)); }) + .def("set", [](ir::Pass &self, const std::string &name, + int val) { self.Set(name, new int(val)); }) .def("get_program", &ir::Pass::Get) .def("type", &ir::Pass::Type) .def("apply", [](ir::Pass &self, std::shared_ptr graph) { diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py index 80deeee879..f38d978341 100644 --- a/python/paddle/fluid/contrib/slim/graph/graph.py +++ b/python/paddle/fluid/contrib/slim/graph/graph.py @@ -18,140 +18,7 @@ from ....framework import Program from ....framework import Block from .... import core -__all__ = ['Graph', 'ImitationGraph', 'IRGraph', 'PyGraph'] - - -class PyGraph(object): - """ - PyGraph uses core.Graph as the delegation to accomplish the manipulation. - """ - - def __init__(self, graph, for_test=False): - """ - Construct the PyGraph using core.Graph. 
- Args: - graph(core.Graph): C++ Graph. - for_test(bool): True for the test graph and false for the train graph. - """ - assert isinstance( - graph, core.Graph), 'graph must be the instance of core.Graph.' - self.graph = graph - self.for_test = for_test - - def is_test(self): - return self.for_test - - def all_parameters(self): - param_nodes = set() - for node in self.graph.nodes(): - if node.is_var() and node.var() is not None and node.var( - ).persistable(): - param_nodes.add(node) - return param_nodes - - def all_vars(self): - return {node for node in self.graph.nodes() if node.is_var()} - - def all_ops(self): - return {node for node in self.graph.nodes() if node.is_op()} - - def create_param_node(self, name, var_type, shape, var_dtype): - var_desc = core.VarDesc(name) - var_desc.set_type(var_type) - var_desc.set_shape(shape) - var_desc.set_dtype(var_dtype) - var_desc.set_persistable(True) - return self.graph.create_var_node(var_desc) - - def create_var_node(self, name, var_type, shape, var_dtype): - var_desc = core.VarDesc(name) - var_desc.set_type(var_type) - var_desc.set_shape(shape) - var_desc.set_dtype(var_dtype) - return self.graph.create_var_node(var_desc) - - def create_var_node_from_desc(self, var_desc): - return self.graph.create_var_node(var_desc) - - def create_op_node(self, op_type, attrs, inputs, outputs): - op_desc = core.OpDesc() - op_desc.set_type(op_type) - for attr, value in attrs.iteritems(): - self._update_desc_attr(op_desc, attr, value) - for input_name, var_nodes in inputs.iteritems(): - if not isinstance(var_nodes, list): - var_nodes = [var_nodes] - op_desc.set_input(input_name, - [var_node.name() for var_node in var_nodes]) - for output_name, var_nodes in outputs.iteritems(): - if not isinstance(var_nodes, list): - var_nodes = [var_nodes] - op_desc.set_output(output_name, - [var_node.name() for var_node in var_nodes]) - return self.graph.create_op_node(op_desc) - - def create_op_node_from_desc(self, op_desc): - return self.graph.create_op_node(op_desc) - - def _update_desc_attr(self, desc, name, val): - """ - Update the value of desc's attribute by attribute's name. 
- """ - if isinstance(val, Block): - desc.set_block_attr(name, val.desc) - elif isinstance(val, list) and val and all( - isinstance(v, Block) for v in val): - desc.set_blocks_attr(name, [v.desc for v in val]) - elif isinstance(val, core.BlockDesc) or \ - isinstance(val, core.ProgramDesc): - desc.set_serialized_attr(name, val.serialize_to_string()) - else: - desc._set_attr(name, val) - - def safe_remove_nodes(self, remove_nodes): - if not isinstance(remove_nodes, set): - remove_nodes = set(remove_nodes) - core.graph_safe_remove_nodes(self.graph, remove_nodes) - - def draw(self, save_path, name, marked_nodes=None): - def _convert_to_pdf(dot_file_path): - pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf' - exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \ - + ' -o ' + pdf_save_path, shell=True) - if exited_code != 0: - print('The dot command is needed for creating pdf files.') - print('The {} is saved as the dot filetype.'.format( - dot_file_path)) - - remove_ctr_vars = set() - ops_num = 0 - for node in self.graph.nodes(): - if node.is_ctrl_var(): - remove_ctr_vars.add(node) - elif node.is_op(): - ops_num += 1 - print('Total ops num = {}.'.format(ops_num)) - self.safe_remove_nodes(remove_ctr_vars) - if marked_nodes is not None: - if not isinstance(marked_nodes, set): - marked_nodes = set(marked_nodes) - marked_nodes = marked_nodes - remove_ctr_vars - if self.graph.has('__graphviz__marked_node__'): - self.graph.erase('__graphviz__marked_node__') - self.graph.set('__graphviz__marked_node__', marked_nodes) - viz_dot_path = os.path.join(save_path, name) + '.dot' - viz_pass = core.get_pass('graph_viz_pass') - viz_pass.set_str('graph_viz_path', viz_dot_path) - viz_pass.apply(self.graph) - _convert_to_pdf(viz_dot_path) - - def to_program(self): - convert_pass = core.get_pass('graph_to_program_pass') - convert_pass.set_program('program', Program().desc) - convert_pass.apply(self.graph) - desc = convert_pass.get_program('program') - program = Program.construct_from_desc(desc) - return program +__all__ = ['Graph', 'ImitationGraph', 'IRGraph'] class Graph(object): diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 3c33a513ff..ce16a32415 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -13,13 +13,12 @@ # limitations under the License. import collections -import numpy as np from .... import core +from ....framework import IrGraph from ....framework import Program from ....framework import Variable from ....initializer import Constant from .... import unique_name -from ..graph import PyGraph __all__ = ['QuantizationTransformPass'] @@ -34,7 +33,7 @@ class QuantizationTransformPass(object): weight_quantize_type='abs_max', window_size=10000): """ - Convert and rewrite the PyGraph according to weight and + Convert and rewrite the IrGraph according to weight and activation quantization type. 
Args: weight_bits (int): quantization bit number for weights, @@ -56,19 +55,19 @@ class QuantizationTransformPass(object): import paddle.fluid as fluid from paddle.fluid.contrib.slim.quantization \ import QuantizationTransformPass - from paddle.fluid.contrib.slim.graph import PyGraph + from paddle.fluid.contrib.slim.graph import IrGraph from paddle.fluid import core - graph = PyGraph(core.Graph(program.desc), for_test=False) + graph = IrGraph(core.Graph(program.desc), for_test=False) exe = fluid.Executor(fluid.CPUPlace()) transform_pass = QuantizationTransformPass(fluid.global_scope(), exe) transform_pass.apply(graph) """ - self.scope = scope - self.program_exe = program_exe - self.weight_bits = weight_bits - self.activation_bits = activation_bits + self._scope = scope + self._program_exe = program_exe + self._weight_bits = weight_bits + self._activation_bits = activation_bits quant_type = ['abs_max', 'range_abs_max'] if activation_quantize_type not in quant_type: @@ -80,27 +79,27 @@ class QuantizationTransformPass(object): "Unknown weight_quantize_type: '%s'. It can only be ", "'abs_max' or 'range_abs_max'.", str(weight_quantize_type)) - self.activation_quantize_type = activation_quantize_type - self.weight_quantize_type = weight_quantize_type - self.window_size = window_size + self._activation_quantize_type = activation_quantize_type + self._weight_quantize_type = weight_quantize_type + self._window_size = window_size - self.need_initialized = collections.OrderedDict() - self.quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul'] - self.quantizable_grad_ops = [ - '%s_grad' % (op) for op in self.quantizable_ops + self._need_initialized = collections.OrderedDict() + self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul'] + self._quantizable_grad_ops = [ + '%s_grad' % (op) for op in self._quantizable_ops ] - self.fake_quant_op_types = [ + self._fake_quant_op_types = [ 'fake_quantize_abs_max', 'fake_quantize_range_abs_max' ] - self.fake_dequant_op_types = ['fake_dequantize_max_abs'] - self.is_test = None - self.global_step = None + self._fake_dequant_op_types = ['fake_dequantize_max_abs'] + self._is_test = None + self._global_step = None def apply(self, graph): assert isinstance(graph, - PyGraph), 'graph must be the instance of PyGraph.' - self.need_initialized.clear() - self.is_test = graph.is_test() + IrGraph), 'graph must be the instance of IrGraph.' + self._need_initialized.clear() + self._is_test = graph.is_test() # marked the variable which has been dequantized. 
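        # A sketch of the rewrite each quantizable input goes through
        # (suffix names follow the '.quantized' / '.dequantized' convention
        # used by this pass):
        #   X -> fake_quantize -> X.quantized (+ scale)
        #     -> fake_dequantize -> X.quantized.dequantized
        # The result is cached below, so a variable feeding several
        # conv2d / depthwise_conv2d / mul ops is only quantized once.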
dequantized_vars = collections.OrderedDict() params = [p.name() for p in graph.all_parameters()] @@ -110,72 +109,69 @@ class QuantizationTransformPass(object): if var_node.name() in dequantized_vars: dequant_var_node = dequantized_vars[var_node.name()] else: - quant_bits = self.weight_bits if var_node.name() in params \ - else self.activation_bits - quant_type = self.weight_quantize_type if var_node.name() \ - in params else self.activation_quantize_type + quant_bits = self._weight_bits if var_node.name() in params \ + else self._activation_bits + quant_type = self._weight_quantize_type if var_node.name() \ + in params else self._activation_quantize_type quant_var_node, scale_var_node = self._insert_quant_op( graph, var_node, quant_bits, quant_type) dequant_var_node = self._insert_dequant_op( graph, quant_var_node, scale_var_node, quant_bits) dequantized_vars[var_node.name()] = dequant_var_node - self._update_input(var_node, dequant_var_node, op) - op.op()._rename_input(var_node.name(), dequant_var_node.name()) + graph.update_input_link(var_node, dequant_var_node, op) def _transform_backward(graph, op): no_dequanted_input_vars = True for var_node in op.inputs: if var_node.name() in dequantized_vars: dequant_var_node = dequantized_vars[var_node.name()] - self._update_input(var_node, dequant_var_node, op) - op.op()._rename_input(var_node.name(), - dequant_var_node.name()) + graph.update_input_link(var_node, dequant_var_node, op) no_dequanted_input_vars = False if no_dequanted_input_vars: raise ValueError("There is no dequanted inputs for op %s." % (op.name())) - if not self.is_test: + if not self._is_test: self._create_global_step(graph) ops = graph.all_ops() # The process of _transform_forward and _transform_backward is needed in two for loops. # The loop for transforming the forward graph: for op in ops: - if op.name() in self.quantizable_ops: + if op.name() in self._quantizable_ops: _transform_forward(graph, op) # The loop for renaming the inputs of backward op. for op in ops: - if op.name() in self.quantizable_grad_ops: + if op.name() in self._quantizable_grad_ops: _transform_backward(graph, op) - if len(self.need_initialized) > 0: - assert self.scope is not None, \ + if len(self._need_initialized) > 0: + assert self._scope is not None, \ 'The scope cannot be set None when activation_quantize_type equals to range_abs_max.' - assert self.program_exe is not None, \ + assert self._program_exe is not None, \ 'The program_exe cannot be set None when activation_quantize_type equals to range_abs_max.' 
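            # Deferred initialization: every parameter node created during
            # the rewrite (the step counter and the scale variables) was
            # recorded in _need_initialized together with its Initializer;
            # a throwaway Program is assembled from those pairs and run once
            # on program_exe, so the new variables hold concrete values in
            # the scope before training starts.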
init_program = Program() - for var_desc, initializer in self.need_initialized.iteritems(): - var = Variable.construct_from_desc(init_program.global_block(), - var_desc) + for var_desc, initializer in self._need_initialized.iteritems(): + var = Variable(init_program.global_block()) + var._set_desc(var_desc) initializer(var, init_program.global_block()) - self.program_exe.run(program=init_program, scope=self.scope) + self._program_exe.run(program=init_program, scope=self._scope) return graph def _create_global_step(self, graph): - if self.weight_quantize_type == 'range_abs_max' or \ - self.activation_quantize_type == 'range_abs_max': + if self._weight_quantize_type == 'range_abs_max' or \ + self._activation_quantize_type == 'range_abs_max': counter_name = '@STEP_COUNTER@' for node in graph.all_vars(): if node.name() == counter_name: - self.global_step = node - if self.global_step is None: + self._global_step = node + if self._global_step is None: global_step_in = graph.create_param_node( name=counter_name, var_type=core.VarDesc.VarType.LOD_TENSOR, shape=[1], var_dtype=core.VarDesc.VarType.INT64) - self.need_initialized[global_step_in.var()] = \ + self._need_initialized[global_step_in.var()] = \ Constant(value=0, force_cpu=True) global_step_out = graph.create_var_node_from_desc( global_step_in.var()) @@ -184,9 +180,9 @@ class QuantizationTransformPass(object): attrs={'step': 1.0}, inputs={'X': global_step_in}, outputs={'Out': global_step_out}) - self._link_to(global_step_in, increment_op) - self._link_to(increment_op, global_step_out) - self.global_step = global_step_out + graph.link_to(global_step_in, increment_op) + graph.link_to(increment_op, global_step_out) + self._global_step = global_step_out def _insert_quant_op(self, graph, var_node, quant_bits, quant_type): """ @@ -220,9 +216,9 @@ class QuantizationTransformPass(object): inputs={'X': var_node}, outputs={'Out': quant_var_node, 'OutScale': scale_var_node}) - self._link_to(var_node, quant_op_node) - self._link_to(quant_op_node, quant_var_node) - self._link_to(quant_op_node, scale_var_node) + graph.link_to(var_node, quant_op_node) + graph.link_to(quant_op_node, quant_var_node) + graph.link_to(quant_op_node, scale_var_node) return quant_var_node, scale_var_node def _insert_quant_range_abs_max_op(self, graph, var_node, quant_bits): @@ -242,26 +238,26 @@ class QuantizationTransformPass(object): var_type=core.VarDesc.VarType.LOD_TENSOR, shape=[1], var_dtype=var_node.var().dtype()) - self.need_initialized[scale_in_node.var()] = Constant(value=0.001) + self._need_initialized[scale_in_node.var()] = Constant(value=0.001) scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) inputs = {'X': var_node, 'InScale': scale_in_node} outputs = {'Out': quant_var_node, 'OutScale': scale_out_node} - if not self.is_test: + if not self._is_test: # The name of scales_var_node maybe 'scales_0', 'scales_1', etc. 
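            # (A reading of the op's contract rather than its exact formula:
            # the [window_size] buffer and the 'Iter' input below let
            # fake_quantize_range_abs_max track per-batch scales over a
            # sliding window during training, instead of recomputing the
            # range from a single batch.)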
scales_node = graph.create_param_node( name=unique_name.generate('scales'), var_type=core.VarDesc.VarType.LOD_TENSOR, - shape=[self.window_size], + shape=[self._window_size], var_dtype=var_node.var().dtype()) - self.need_initialized[scales_node.var()] = Constant(value=0) - inputs['Iter'] = self.global_step + self._need_initialized[scales_node.var()] = Constant(value=0) + inputs['Iter'] = self._global_step outputs['OutScales'] = scales_node attrs = { - 'window_size': self.window_size, + 'window_size': self._window_size, 'bit_length': quant_bits, - 'is_test': self.is_test + 'is_test': self._is_test } quant_op_node = graph.create_op_node( op_type='fake_quantize_range_abs_max', @@ -269,14 +265,14 @@ class QuantizationTransformPass(object): inputs=inputs, outputs=outputs) - self._link_to(var_node, quant_op_node) - self._link_to(scale_in_node, quant_op_node) - self._link_to(quant_op_node, quant_var_node) - self._link_to(quant_op_node, scale_out_node) + graph.link_to(var_node, quant_op_node) + graph.link_to(scale_in_node, quant_op_node) + graph.link_to(quant_op_node, quant_var_node) + graph.link_to(quant_op_node, scale_out_node) - if not self.is_test: - self._link_to(self.global_step, quant_op_node) - self._link_to(quant_op_node, scales_node) + if not self._is_test: + graph.link_to(self._global_step, quant_op_node) + graph.link_to(quant_op_node, scales_node) return quant_var_node, scale_out_node @@ -298,21 +294,11 @@ class QuantizationTransformPass(object): inputs={'X': var_node, 'Scale': scale_var_node}, outputs={'Out': dequant_var_node}) - self._link_to(var_node, dequant_op_node) - self._link_to(scale_var_node, dequant_op_node) - self._link_to(dequant_op_node, dequant_var_node) + graph.link_to(var_node, dequant_op_node) + graph.link_to(scale_var_node, dequant_op_node) + graph.link_to(dequant_op_node, dequant_var_node) return dequant_var_node - def _update_input(self, old_input_node, new_input_node, op_node): - old_input_node.outputs_remove(op_node) - op_node.inputs_remove(old_input_node) - new_input_node.outputs_append(op_node) - op_node.inputs_append(new_input_node) - - def _link_to(self, node_in, node_out): - node_in.outputs_append(node_out) - node_out.inputs_append(node_in) - def _quantized_var_name(self, var_name): """ Return quantized variable name for the input `var_name`. @@ -330,25 +316,3 @@ class QuantizationTransformPass(object): Return quantized variable name for the input `var_name`. """ return "%s.scale" % (var_name) - - def _original_var_name(self, var_name): - """ - Return the original variable name. 
- """ - if var_name.endswith('.quantized.dequantized'): - return var_name[:-len('.quantized.dequantized')] - if var_name.endswith('.quantized'): - return var_name[:-len('.quantized')] - if var_name.endswith('.dequantized'): - return var_name[:-len('.dequantized')] - if var_name.endswith('.scale'): - return var_name[:-len('.scale')] - else: - return var_name - - def _is_float(self, v): - return isinstance(v, float) or isinstance(v, np.float32) - - def _quant(self, x, scale, num_bits): - y = np.round(x / scale * ((1 << (num_bits - 1)) - 1)) - return y diff --git a/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py index 31188bedbb..1bd4b95d6b 100644 --- a/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py @@ -18,8 +18,8 @@ import numpy as np import paddle.fluid as fluid import six from paddle.fluid.framework import Program +from paddle.fluid.framework import IrGraph from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass -from paddle.fluid.contrib.slim.graph import PyGraph from paddle.fluid import core @@ -106,7 +106,7 @@ class TestQuantizationTransformPass(unittest.TestCase): opt = fluid.optimizer.Adam(learning_rate=0.001) opt.minimize(loss) exe = fluid.Executor(fluid.CPUPlace()) - graph = PyGraph(core.Graph(main.desc), for_test=False) + graph = IrGraph(core.Graph(main.desc), for_test=False) transform_pass = QuantizationTransformPass( scope=fluid.global_scope(), program_exe=exe, @@ -119,7 +119,7 @@ class TestQuantizationTransformPass(unittest.TestCase): graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes) program = graph.to_program() self.check_program(transform_pass, program) - val_graph = PyGraph(core.Graph(program.desc), for_test=False) + val_graph = IrGraph(core.Graph(program.desc), for_test=False) val_marked_nodes = set() for op in val_graph.all_ops(): if op.name().find('quantize') > -1: @@ -142,7 +142,7 @@ class TestQuantizationTransformPass(unittest.TestCase): opt = fluid.optimizer.Adam(learning_rate=0.001) opt.minimize(loss) exe = fluid.Executor(fluid.CPUPlace()) - graph = PyGraph(core.Graph(main.desc), for_test=False) + graph = IrGraph(core.Graph(main.desc), for_test=False) transform_pass = QuantizationTransformPass( scope=fluid.global_scope(), program_exe=exe, @@ -155,7 +155,7 @@ class TestQuantizationTransformPass(unittest.TestCase): graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes) program = graph.to_program() self.check_program(transform_pass, program) - val_graph = PyGraph(core.Graph(program.desc), for_test=False) + val_graph = IrGraph(core.Graph(program.desc), for_test=False) val_marked_nodes = set() for op in val_graph.all_ops(): if op.name().find('quantize') > -1: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 1913f58e67..fc5e471ae3 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -23,6 +23,7 @@ import traceback import six import numpy as np +import subprocess from .. import compat as cpt from .proto import framework_pb2 @@ -381,27 +382,6 @@ class Variable(object): self._ivar.desc = self.desc self._ivar.stop_gradient = stop_gradient - @staticmethod - def construct_from_desc(block, desc): - """ - Construct a Variable from variable desc. - Args: - desc(core.VarDesc): The variable desc for constructing. - - Returns: - Variable: A variable. 
- """ - v = Variable( - block=block, - type=desc.type(), - name=desc.name(), - shape=desc.shape(), - dtype=desc.dtype(), - lod_level=desc.lod_level(), - persistable=desc.persistable()) - v.desc = desc - return v - def _numpy(self): tensor = self._ivar.value().get_tensor() return np.array(tensor) @@ -1533,6 +1513,154 @@ class Block(object): return ret_var +class IrGraph(object): + """ + IrGraph uses core.Graph as the delegation to accomplish the manipulation. + """ + + def __init__(self, graph, for_test=False): + """ + Construct the IrGraph using core.Graph. + Args: + graph(core.Graph): C++ Graph. + for_test(bool): True for the test graph and false for the train graph. + """ + assert isinstance( + graph, core.Graph), 'graph must be the instance of core.Graph.' + self.graph = graph + self._for_test = for_test + + def is_test(self): + return self._for_test + + def all_parameters(self): + param_nodes = set() + for node in self.graph.nodes(): + if node.is_var() and node.var() is not None and node.var( + ).persistable(): + param_nodes.add(node) + return param_nodes + + def all_vars(self): + return {node for node in self.graph.nodes() if node.is_var()} + + def all_ops(self): + return {node for node in self.graph.nodes() if node.is_op()} + + def create_param_node(self, name, var_type, shape, var_dtype): + var_desc = core.VarDesc(name) + var_desc.set_type(var_type) + var_desc.set_shape(shape) + var_desc.set_dtype(var_dtype) + var_desc.set_persistable(True) + return self.graph.create_var_node(var_desc) + + def create_var_node(self, name, var_type, shape, var_dtype): + var_desc = core.VarDesc(name) + var_desc.set_type(var_type) + var_desc.set_shape(shape) + var_desc.set_dtype(var_dtype) + return self.graph.create_var_node(var_desc) + + def create_var_node_from_desc(self, var_desc): + return self.graph.create_var_node(var_desc) + + def create_op_node(self, op_type, attrs, inputs, outputs): + op_desc = core.OpDesc() + op_desc.set_type(op_type) + for attr, value in attrs.iteritems(): + self._update_desc_attr(op_desc, attr, value) + for input_name, var_nodes in inputs.iteritems(): + if not isinstance(var_nodes, list): + var_nodes = [var_nodes] + op_desc.set_input(input_name, + [var_node.name() for var_node in var_nodes]) + for output_name, var_nodes in outputs.iteritems(): + if not isinstance(var_nodes, list): + var_nodes = [var_nodes] + op_desc.set_output(output_name, + [var_node.name() for var_node in var_nodes]) + return self.graph.create_op_node(op_desc) + + def create_op_node_from_desc(self, op_desc): + return self.graph.create_op_node(op_desc) + + def update_input_link(self, old_input_node, new_input_node, op_node): + assert old_input_node in self.graph.nodes() and new_input_node in self.graph.nodes() and \ + op_node in self.graph.nodes(), 'Th three arguments must be in the graph nodes.' + old_input_node.outputs_remove(op_node) + op_node.inputs_remove(old_input_node) + new_input_node.outputs_append(op_node) + op_node.inputs_append(new_input_node) + op_node.op()._rename_input(old_input_node.name(), new_input_node.name()) + + def link_to(self, node_in, node_out): + assert node_in in self.graph.nodes() and node_out in self.graph.nodes(), \ + 'Th two arguments must be in the graph nodes.' 
+ node_in.outputs_append(node_out) + node_out.inputs_append(node_in) + + def safe_remove_nodes(self, remove_nodes): + if not isinstance(remove_nodes, set): + remove_nodes = set(remove_nodes) + core.graph_safe_remove_nodes(self.graph, remove_nodes) + + def draw(self, save_path, name, marked_nodes=None): + def _convert_to_pdf(dot_file_path): + pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf' + exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \ + + ' -o ' + pdf_save_path, shell=True) + if exited_code != 0: + print('The dot command is needed for creating pdf files.') + print('The {} is saved as the dot filetype.'.format( + dot_file_path)) + + remove_ctr_vars = set() + ops_num = 0 + for node in self.graph.nodes(): + if node.is_ctrl_var(): + remove_ctr_vars.add(node) + elif node.is_op(): + ops_num += 1 + print('Total ops num = {}.'.format(ops_num)) + self.safe_remove_nodes(remove_ctr_vars) + if marked_nodes is not None: + if not isinstance(marked_nodes, set): + marked_nodes = set(marked_nodes) + marked_nodes = marked_nodes - remove_ctr_vars + if self.graph.has('__graphviz__marked_node__'): + self.graph.erase('__graphviz__marked_node__') + self.graph.set('__graphviz__marked_node__', marked_nodes) + viz_dot_path = os.path.join(save_path, name) + '.dot' + viz_pass = core.get_pass('graph_viz_pass') + viz_pass.set('graph_viz_path', viz_dot_path) + viz_pass.apply(self.graph) + _convert_to_pdf(viz_dot_path) + + def to_program(self): + convert_pass = core.get_pass('graph_to_program_pass') + convert_pass.set('program', Program().desc) + convert_pass.apply(self.graph) + desc = convert_pass.get_program('program') + program = Program._construct_from_desc(desc) + return program + + def _update_desc_attr(self, desc, name, val): + """ + Update the value of desc's attribute by attribute's name. + """ + if isinstance(val, Block): + desc.set_block_attr(name, val.desc) + elif isinstance(val, list) and val and all( + isinstance(v, Block) for v in val): + desc.set_blocks_attr(name, [v.desc for v in val]) + elif isinstance(val, core.BlockDesc) or \ + isinstance(val, core.ProgramDesc): + desc.set_serialized_attr(name, val.serialize_to_string()) + else: + desc._set_attr(name, val) + + class Program(object): """ Python Program. Beneath it is a ProgramDesc, which is used for @@ -1958,12 +2086,10 @@ class Program(object): return p @staticmethod - def construct_from_desc(desc): + def _construct_from_desc(desc): """ Construct a program from program desc. - Notes: All information about parameters will be lost. - Args: desc(core.ProgramDesc): The program desc for constructing. From c6f99a16451c47cfa12633d3b871c3e8940cbd48 Mon Sep 17 00:00:00 2001 From: WangZhen Date: Tue, 22 Jan 2019 14:56:12 +0800 Subject: [PATCH 066/123] Update API.spec. 
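The construct_from_desc entry is dropped because the previous patch renamed
it to the private Program._construct_from_desc, and API.spec only tracks the
public surface. The helper is still exercised indirectly, e.g. by
IrGraph.to_program(); a minimal sketch, assuming `main` is an already-built
Program:

    from paddle.fluid import core
    from paddle.fluid.framework import IrGraph

    graph = IrGraph(core.Graph(main.desc), for_test=False)
    program = graph.to_program()  # calls Program._construct_from_desc
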
test=develop --- paddle/fluid/API.spec | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index b4fc560a5a..ad39542b4d 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -1,7 +1,6 @@ paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.Program.construct_from_desc ArgSpec(args=['desc'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) From 2f4aee361a7bacbac375ea082b1a1a646c6b3b40 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Tue, 22 Jan 2019 07:20:52 +0000 Subject: [PATCH 067/123] fix comments test=develop --- .../fluid/inference/tests/api/tester_helper.h | 19 +++++++++++- .../inference/tests/api/trt_models_tester.cc | 31 +++++++++++++++++++ .../tensorrt/tensorrt_engine_op_test.cc | 4 +-- 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index ac964dc0c8..8ee89c34f0 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -56,6 +56,13 @@ DECLARE_int32(paddle_num_threads); namespace paddle { namespace inference { +float Random(float low, float high) { + static std::random_device rd; + static std::mt19937 mt(rd()); + std::uniform_real_distribution dist(low, high); + return dist(mt); +} + void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { const auto *analysis_config = reinterpret_cast(config); @@ -176,7 +183,7 @@ void SetFakeImageInput(std::vector> *inputs, float *input_data = static_cast(input.data.data()); // fill input data, for profile easily, do not use random data here. 
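    // (Random() above is seeded from std::random_device, so the generated
    // inputs now differ between runs; the native-vs-analysis comparisons
    // stay valid because both predictors consume the same generated batch.)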
for (size_t j = 0; j < len; ++j) { - *(input_data + j) = static_cast(j) / len; + *(input_data + j) = Random(0, 10.); } } (*inputs).emplace_back(input_slots); @@ -344,6 +351,16 @@ void CompareNativeAndAnalysis( CompareResult(analysis_outputs, native_outputs); } +void CompareNativeAndAnalysis( + PaddlePredictor *native_pred, PaddlePredictor *analysis_pred, + const std::vector> &inputs) { + int batch_size = FLAGS_batch_size; + std::vector native_outputs, analysis_outputs; + native_pred->Run(inputs[0], &native_outputs, batch_size); + analysis_pred->Run(inputs[0], &analysis_outputs, batch_size); + CompareResult(analysis_outputs, native_outputs); +} + template std::string LoDTensorSummary(const framework::LoDTensor &tensor) { std::stringstream ss; diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 9725c19032..8d17754293 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -107,6 +107,27 @@ void compare(std::string model_dir, bool use_tensorrt) { inputs_all); } +void compare_continuous_input(std::string model_dir, bool use_tensorrt) { + contrib::AnalysisConfig analysis_config; + SetConfig(&analysis_config, model_dir, true, + use_tensorrt, FLAGS_batch_size); + auto config = + reinterpret_cast(&analysis_config); + auto native_pred = CreateTestPredictor(config, false); + auto analysis_pred = CreateTestPredictor(config, true); + for (int i = 0; i < 100; i++) { + std::vector> inputs_all; + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename, + FLAGS_param_filename); + } else { + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + } + CompareNativeAndAnalysis(native_pred.get(), analysis_pred.get(), + inputs_all); + } +} + TEST(TensorRT_mobilenet, compare) { std::string model_dir = FLAGS_infer_model + "/mobilenet"; compare(model_dir, /* use_tensorrt */ true); @@ -157,5 +178,15 @@ TEST(AnalysisPredictor, use_gpu) { } } +TEST(resnet50, compare_continuous_input) { + std::string model_dir = FLAGS_infer_model + "/resnet50"; + compare_continuous_input(model_dir, true); +} + +TEST(resnet50, compare_continuous_input_native) { + std::string model_dir = FLAGS_infer_model + "/resnet50"; + compare_continuous_input(model_dir, false); +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index bb25a37584..391e7a1c07 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -99,7 +99,7 @@ TEST(TensorRTEngineOp, manual) { SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); SetAttr(engine_op_desc.Proto(), "max_batch_size", 2); - SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 20); + SetAttr(engine_op_desc.Proto(), "workspace_size", 1 << 20); SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr>(engine_op_desc.Proto(), "parameters", std::vector({})); @@ -193,7 +193,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); SetAttr(engine_op_desc.Proto(), "max_batch_size", batch_size); - SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 20); + SetAttr(engine_op_desc.Proto(), "workspace_size", 1 << 20); 
SetAttr>( engine_op_desc.Proto(), "parameters", std::vector({"y0", "y1", "y2", "y3"})); From 3b668c157424dce5cbf52cbe813d15275616b1f3 Mon Sep 17 00:00:00 2001 From: WangZhen Date: Tue, 22 Jan 2019 15:37:04 +0800 Subject: [PATCH 068/123] Update some comments in the quantization transform pass. test=develop --- .../paddle/fluid/contrib/slim/quantization/quantization_pass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index ce16a32415..266a106bc5 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -313,6 +313,6 @@ class QuantizationTransformPass(object): def _quantized_scale_name(self, var_name): """ - Return quantized variable name for the input `var_name`. + Return the scale name of quantized variable for the input `var_name`. """ return "%s.scale" % (var_name) From f534c66d2d7d87aa580538513f4835439acd7bc0 Mon Sep 17 00:00:00 2001 From: flame Date: Tue, 22 Jan 2019 17:17:51 +0800 Subject: [PATCH 069/123] fix test_word2vec bug (#15462) fix test_word2vec float's equality bug --- python/paddle/fluid/tests/book/test_word2vec.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 48cb778927..487a29c839 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -220,9 +220,7 @@ def infer(use_cuda, save_dirname=None): np_data = np.array(results[0]) infer_out = infer_outputs[0].data.float_data() for a, b in zip(np_data[0], infer_out): - g_a = float("{:.6g}".format(a)) - g_b = float("{:.6g}".format(b)) - assert g_a == g_b + assert np.isclose(a, b), "a: {}, b: {}".format(a, b) def main(use_cuda, is_sparse, is_parallel): From bac08c4a263eeda61cc2f1bcf20d005f51f542ef Mon Sep 17 00:00:00 2001 From: WangZhen Date: Tue, 22 Jan 2019 18:26:00 +0800 Subject: [PATCH 070/123] Fix some bugs caused by set functions of the Pass class. 
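With the overloaded Pass.set bindings, the typed set_int/set_str spellings
no longer exist, so the remaining Python callers move to the single entry
point. A sketch of the updated usage, mirroring the tests below (`idx` and
`path` are placeholders):

    mypass = pass_builder.insert_pass(idx, "multi_batch_merge_pass")
    mypass.set("num_repeats", 4)          # was mypass.set_int(...)
    viz_pass.set("graph_viz_path", path)  # was viz_pass.set_str(...)
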
test=develop --- python/paddle/fluid/tests/unittests/test_dist_base.py | 2 +- python/paddle/fluid/tests/unittests/test_pass_builder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 3fcdc57906..69a38618cd 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -123,7 +123,7 @@ class TestDistRunnerBase(object): pass_builder = build_stra._finalize_strategy_and_create_passes() mypass = pass_builder.insert_pass( len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass") - mypass.set_int("num_repeats", args.batch_merge_repeat) + mypass.set("num_repeats", args.batch_merge_repeat) if args.update_method == "nccl2": build_stra.num_trainers = len(args.endpoints.split(",")) diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py index 8c9e489e02..7e1c2572f0 100644 --- a/python/paddle/fluid/tests/unittests/test_pass_builder.py +++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py @@ -111,7 +111,7 @@ class TestPassBuilder(unittest.TestCase): pass_builder.remove_pass(len(pass_builder.all_passes()) - 1) self.assertEqual(origin_len + 1, len(pass_builder.all_passes())) - viz_pass.set_str("graph_viz_path", "/tmp/test_viz_pass") + viz_pass.set("graph_viz_path", "/tmp/test_viz_pass") self.check_network_convergence( use_cuda=core.is_compiled_with_cuda(), From 5a8bd82c0cef63bd7313171e8049953aa2db43f6 Mon Sep 17 00:00:00 2001 From: chengduo Date: Tue, 22 Jan 2019 06:40:47 -0600 Subject: [PATCH 071/123] Remove workspace_handle (#15376) * remove workspace_handle test=develop * set constant for loss test=develop --- paddle/fluid/operators/conv_fusion_op.cu.cc | 65 +++++++++++-------- .../operators/conv_transpose_cudnn_op.cu.cc | 57 ++++++++-------- .../fused/fusion_conv_inception_op.cu | 23 +++---- paddle/fluid/operators/warpctc_cudnn_op.cu.cc | 24 +++---- 4 files changed, 93 insertions(+), 76 deletions(-) diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index d8b997cca6..f97ebecfdd 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -104,7 +104,9 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionFwdAlgo_t algo; auto handle = dev_ctx.cudnn_handle(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + + Tensor cudnn_workspace; + void* cudnn_workspace_ptr = nullptr; CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_DEFAULT_MATH)); @@ -118,19 +120,24 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { workspace_size_limit, &algo)); VLOG(3) << "cuDNN forward algo " << algo; } else { + cudnn_workspace = + ctx.AllocateTmpTensor( + framework::make_ddim( + {static_cast(workspace_size_limit)}), + dev_ctx); + cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); + auto search_func = [&]() { int returned_algo_count; std::array fwd_perf_stat; - auto cudnn_find_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE( - platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( - handle, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, - kNUM_CUDNN_FWD_ALGS, &returned_algo_count, - fwd_perf_stat.data(), 
cudnn_workspace, workspace_size_limit)); - }; - workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); + + CUDNN_ENFORCE(platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, + kNUM_CUDNN_FWD_ALGS, &returned_algo_count, fwd_perf_stat.data(), + cudnn_workspace_ptr, workspace_size_limit)); + VLOG(3) << "Perf result: (algo: stat, time, memory)"; for (int i = 0; i < returned_algo_count; ++i) { const auto& stat = fwd_perf_stat[i]; @@ -181,6 +188,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); + if (!cudnn_workspace_ptr) { + cudnn_workspace = + ctx.AllocateTmpTensor( + framework::make_ddim( + {static_cast(workspace_size_in_bytes)}), + dev_ctx); + cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); + } + if ((activation == "identity") && (!residual)) { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. @@ -188,13 +204,12 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // cudnnConvolutionForward and cudnnAddTensor // ------------- cudnn conv forward and bias add --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; - auto cudnn_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, algo, cudnn_workspace, - workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr, + workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); + CUDNN_ENFORCE(platform::dynload::cudnnAddTensor( handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc, output_data)); @@ -205,15 +220,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // ------------------- cudnn conv+bias+act forward -------------------- ScalingParamType alpha1 = 1.0f; ScalingParamType alpha2 = residual ? 
1.0f : 0.0f; - auto cudnn_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( - handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, algo, cudnn_workspace, - workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, - cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, - output_data)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr, + workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, + cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, + output_data)); } std::vector channels = ctx.Attr>("split_channels"); if (channels.size()) { diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index f44094ca6b..016cf8448c 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -104,16 +104,18 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { int output_offset = output->numel() / output->dims()[0] / groups; int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + + auto temp_allocation = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( + workspace_size_in_bytes); + void* cudnn_workspace = temp_allocation->ptr(); + for (int g = 0; g < groups; g++) { - auto cudnn_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, - cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, - algo, cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_output_desc, output_data + output_offset * g)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, + cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, + algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_output_desc, output_data + output_offset * g)); } } }; @@ -209,20 +211,22 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { output_grad->numel() / output_grad->dims()[0] / groups; int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + + auto temp_allocation = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( + workspace_size_in_bytes); + void* cudnn_workspace = temp_allocation->ptr(); + if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. 
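      // The input gradient of a transposed convolution is an ordinary
      // forward convolution of the output gradient with the same filters,
      // hence the cudnnConvolutionForward call below; the single
      // cudnn_workspace taken from the temporary allocator above is reused
      // across all groups.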
for (int g = 0; g < groups; g++) { - auto cudnn_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_filter_desc, - filter_data + filter_offset * g, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + input_offset * g)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_filter_desc, + filter_data + filter_offset * g, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + input_offset * g)); } } @@ -232,15 +236,12 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset filter_grad. // Gradient with respect to the filter for (int g = 0; g < groups; g++) { - auto cudnn_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_input_desc, - input_data + input_offset * g, cudnn_conv_desc, filter_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_filter_desc, filter_grad_data + filter_offset * g)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_input_desc, + input_data + input_offset * g, cudnn_conv_desc, filter_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc, + filter_grad_data + filter_offset * g)); } } } diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 6e13887866..c72a966c57 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -216,18 +216,19 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { out_datas.push_back( static_cast(output_data + (oc0 + oc1 + oc2) * h * w)); + auto temp_allocation = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( + workspace_size_in_bytes); + void* cudnn_workspace = temp_allocation->ptr(); + for (int i = 0; i < 4; ++i) { - auto func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( - handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], - static_cast(filters[i]->data()), conv_desc[i], - algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, - out_desc[i], out_datas[i], bias_desc[i], - static_cast(bias[i]->data()), cudnn_act_desc, - out_desc[i], out_datas[i])); - }; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - workspace_handle.RunFunc(func, workspace_size_in_bytes); + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], + static_cast(filters[i]->data()), conv_desc[i], + algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, out_desc[i], + out_datas[i], bias_desc[i], + static_cast(bias[i]->data()), cudnn_act_desc, + out_desc[i], out_datas[i])); } cudnnTensorDescriptor_t x_desc; diff --git a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc 
b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc index a764d59410..5e16a209e7 100644 --- a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc +++ b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc @@ -144,17 +144,19 @@ class CudnnCTCKernel : public framework::OpKernel { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, &workspace_size)); T* loss_data = loss->mutable_data(loss_dims, ctx.GetPlace()); - - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - auto cudnn_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss( - handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data, - warpctc_label_lengths.data(), warpctc_logits_lengths.data(), - loss_data, cu_grad_desc, warpctc_grad_data, - CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, cudnn_workspace, - workspace_size)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size); + math::SetConstant()( + ctx.template device_context(), loss, static_cast(0)); + + auto temp_allocation = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( + workspace_size); + void* cudnn_workspace = temp_allocation->ptr(); + + CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss( + handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data, + warpctc_label_lengths.data(), warpctc_logits_lengths.data(), loss_data, + cu_grad_desc, warpctc_grad_data, CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, + cu_ctcloss_desc, cudnn_workspace, workspace_size)); } }; From f4dec5cdeeba98c3955e02decd74fb9c02fc3202 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 23 Jan 2019 10:45:28 +0800 Subject: [PATCH 072/123] Check collective server's data. (#15449) --- .../distributed/collective_server_test.cc | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc index 5009058422..90f2f9fd65 100644 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ b/paddle/fluid/operators/distributed/collective_server_test.cc @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/distributed/collective_client.h" #include "paddle/fluid/operators/distributed/collective_server.h" @@ -57,7 +58,7 @@ std::unique_ptr GenerateVars(platform::Place place) { auto* tensor = slr->mutable_value(); auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({20000, 1024})); + tensor->Resize(framework::make_ddim({3, 1024})); tensor->mutable_data(place); paddle::operators::math::set_constant(ctx, tensor, 32.7); @@ -80,6 +81,20 @@ void Gather(const std::vector& vars, std::vector dst; client->Gather(vars, &dst, *dev_ctx, scope); std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]); + dev_ctx->Wait(); + + ASSERT_EQ(dst[0]->value().dims(), framework::make_ddim({3, 1024})); + ASSERT_EQ(dst[0]->height(), 20000); + ASSERT_EQ(dst[0]->rows().size(), static_cast(3)); + for (int i = 0; i < 3; i++) { + ASSERT_EQ(dst[0]->rows()[i], i); + } + + std::vector vec; + TensorToVector(dst[0]->value(), *dev_ctx, &vec); + for (size_t i = 0; i < 3 * 1024; i++) { + ASSERT_FLOAT_EQ(vec[i], 32.7); + } } TEST(CollectiveServer, GPU) { From eaad3e4c3dc9926054ec2989cc780df734a433bb Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Wed, 23 Jan 2019 10:57:36 +0800 Subject: [PATCH 073/123] Add check of input in sequence_expand op. (#15466) * Add check of input in sequence_expand op. test=develop * Correct the unittest of sequence_expand op. test=develop --- paddle/fluid/operators/sequence_ops/sequence_expand_op.cc | 5 +++++ python/paddle/fluid/tests/unittests/test_sequence_expand.py | 3 +-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc index c07e6962e6..27e0201bd7 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc @@ -68,6 +68,11 @@ class SequenceExpandOp : public framework::OperatorWithKernel { "Level number of Input(X)'s lod could be 0. 
Otherwise " "size of Input(X)'s first level lod should be equal to " "size of Input(Y)'s referred level lod."); + } else { + PADDLE_ENFORCE_EQ(x_dims[0], y_lod[ref_level].size() - 1, + "When Input(X)'s lod is null, the dims[0] of " + "Input(X) should match the " + "size of Input(Y)'s referred level lod."); } int64_t out_first_dim = 0; diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py index ffd4026dba..d33a57f675 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py @@ -81,11 +81,10 @@ class TestSequenceExpand(OpTest): class TestSequenceExpandCase1(TestSequenceExpand): def set_data(self): x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') - x_lod = [[2, 3]] y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32') y_lod = [[2, 3], [2, 2, 3, 3, 3]] self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} - self.attrs = {'ref_level': 0} + self.attrs = {'ref_level': 1} class TestSequenceExpandCase2(TestSequenceExpand): From 9f5108a6733c777eaf56806a113eeb776493a989 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 23 Jan 2019 11:41:05 +0800 Subject: [PATCH 074/123] Add cicheck_brpc (#15468) --- paddle/scripts/paddle_build.sh | 57 +++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index cda04451f5..bb7258ee59 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -164,6 +164,9 @@ function cmake_gen() { INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo} fi + distibuted_flag=${WITH_DISTRIBUTE:-OFF} + grpc_flag=${WITH_GRPC:-${distibuted_flag}} + cat < Date: Wed, 23 Jan 2019 11:48:13 +0800 Subject: [PATCH 075/123] Add generate_mask_labels_op to support Mask-RCNN and refine some code. (#15371) * Add generate_mask_labels_op to support Mask-RCNN. * Refine sigmoid_cross_entropy to support nomalize mode. * Fix generator_proposals_label. * Use DeviceTemporaryAllocator in roi_pool and roi_algin. * Remove shape check in data_feeder. 
--- paddle/fluid/API.spec | 3 +- paddle/fluid/operators/affine_channel_op.cu | 15 +- .../fluid/operators/detection/CMakeLists.txt | 4 + paddle/fluid/operators/detection/bbox_util.h | 8 +- .../detection/generate_mask_labels_op.cc | 437 ++++++++++++++++++ .../detection/generate_proposal_labels_op.cc | 59 +-- paddle/fluid/operators/detection/mask_util.cc | 229 +++++++++ paddle/fluid/operators/detection/mask_util.h | 30 ++ .../operators/detection/mask_util_test.cc | 115 +++++ paddle/fluid/operators/gather_op.cc | 2 + paddle/fluid/operators/roi_align_op.cu | 47 +- paddle/fluid/operators/roi_pool_op.cu | 51 +- .../sigmoid_cross_entropy_with_logits_op.cc | 17 +- .../sigmoid_cross_entropy_with_logits_op.cu | 180 +++++++- .../sigmoid_cross_entropy_with_logits_op.h | 127 ++--- python/paddle/fluid/data_feeder.py | 4 +- python/paddle/fluid/layers/detection.py | 182 +++++++- python/paddle/fluid/layers/nn.py | 22 +- python/paddle/fluid/tests/test_detection.py | 144 ++++-- .../unittests/test_generate_mask_labels_op.py | 421 +++++++++++++++++ .../test_generate_proposal_labels_op.py | 4 +- .../unittests/test_generate_proposals_op.py | 4 +- ...st_sigmoid_cross_entropy_with_logits_op.py | 32 ++ 23 files changed, 1933 insertions(+), 204 deletions(-) create mode 100644 paddle/fluid/operators/detection/generate_mask_labels_op.cc create mode 100644 paddle/fluid/operators/detection/mask_util.cc create mode 100644 paddle/fluid/operators/detection/mask_util.h create mode 100644 paddle/fluid/operators/detection/mask_util_test.cc create mode 100644 python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index ad39542b4d..430882dee9 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -197,7 +197,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -318,6 +318,7 @@ paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'asp paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)) paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], 
None, True)) paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) +paddle.fluid.layers.generate_mask_labels ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index 2bebdb345a..c054fdb1ba 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -83,7 +83,7 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel( T* dbias) { const int outer_size = C; const int inner_size = N * HxW; - typedef cub::BlockReduce BlockReduce; + typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage ds_storage; __shared__ typename BlockReduce::TempStorage db_storage; @@ -97,13 +97,16 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel( ds_sum += dy[index] * x[index]; db_sum += dy[index]; } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + __syncthreads(); + auto ds_out = + BlockReduce(ds_storage).Reduce(static_cast(ds_sum), cub::Sum()); + auto db_out = + BlockReduce(db_storage).Reduce(static_cast(db_sum), cub::Sum()); + __syncthreads(); if (threadIdx.x == 0) { - dscale[i] = ds_sum; - dbias[i] = db_sum; + dscale[i] = ds_out; + dbias[i] = db_out; } - __syncthreads(); } } diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 6c85f1577e..d3a61dc367 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -45,3 +45,7 @@ detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op foreach(src ${LOCAL_DETECTION_LIBS}) set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs") endforeach() + +cc_library(mask_util SRCS mask_util.cc DEPS memory) +cc_test(mask_util_test SRCS mask_util_test.cc DEPS memory mask_util) +detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS mask_util) diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index 6abeca1da4..b99edb5bf0 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -1,13 +1,17 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ + #pragma once #include #include "paddle/fluid/framework/eigen.h" @@ -88,7 +92,9 @@ void BboxOverlaps(const framework::Tensor& r_boxes, inter_w = std::max(x_max - x_min + 1, zero); inter_h = std::max(y_max - y_min + 1, zero); inter_area = inter_w * inter_h; - overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area); + overlaps_et(i, j) = + (inter_area == 0.) ? 0 : inter_area / + (r_box_area + c_box_area - inter_area); } } } diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc new file mode 100644 index 0000000000..46727c29de --- /dev/null +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -0,0 +1,437 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/detection/mask_util.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +const int kBoxDim = 4; + +template +void AppendMask(LoDTensor* out, int64_t offset, Tensor* to_add) { + auto* out_data = out->data(); + auto* to_add_data = to_add->data(); + memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); +} + +class GenerateMaskLabelsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("GtClasses"), + "Input(GtClasses) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("IsCrowd"), + "Input(IsCrowd) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("GtSegms"), + "Input(GtSegms) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Rois"), "Input(Rois) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("LabelsInt32"), + "Input(LabelsInt32) shouldn't be null."); + + PADDLE_ENFORCE( + ctx->HasOutput("MaskRois"), + "Output(MaskRois) of GenerateMaskLabelsOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("RoiHasMaskInt32"), + "Output(RoiHasMaskInt32) of GenerateMaskLabelsOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("MaskInt32"), + "Output(MaskInt32) of GenerateMaskLabelsOp should not be null"); + + auto im_info_dims = ctx->GetInputDim("ImInfo"); + auto gt_segms_dims = ctx->GetInputDim("GtSegms"); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); + PADDLE_ENFORCE_EQ(gt_segms_dims.size(), 2, + "The rank of Input(GtSegms) must be 
2."); + PADDLE_ENFORCE_EQ(gt_segms_dims[1], 2, + "The second dim of Input(GtSegms) must be 2."); + int num_classes = ctx->Attrs().Get("num_classes"); + int resolution = ctx->Attrs().Get("resolution"); + + ctx->SetOutputDim("MaskRois", {-1, 4}); + ctx->SetOutputDim("RoiHasMaskInt32", {-1, 1}); + ctx->SetOutputDim("MaskInt32", {-1, num_classes * resolution * resolution}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Rois")); + return framework::OpKernelType(data_type, platform::CPUPlace()); + } +}; + +/* + * Expand masks from shape (#masks, M ** 2) to (#masks, #classes * M ** 2) + * to encode class specific mask targets. + */ +template +static inline void ExpandMaskTarget(const platform::CPUDeviceContext& ctx, + const Tensor& masks, + const Tensor& mask_class_labels, + const int resolution, const int num_classes, + Tensor* mask_targets) { + const uint8_t* masks_data = masks.data(); + int64_t num_mask = masks.dims()[0]; + const int* mask_class_labels_data = mask_class_labels.data(); + const int M = resolution * resolution; + const int mask_dim = M * num_classes; + + int* mask_targets_data = + mask_targets->mutable_data({num_mask, mask_dim}, ctx.GetPlace()); + math::set_constant(ctx, mask_targets, -1); + for (int64_t mask_id = 0; mask_id < num_mask; ++mask_id) { + int cls = mask_class_labels_data[mask_id]; + int start = M * cls; + if (cls > 0) { + for (int i = 0; i < M; ++i) { + mask_targets_data[mask_id * mask_dim + start + i] = + static_cast(masks_data[mask_id * M + i]); + } + } + } +} + +template +std::vector SampleMaskForOneImage( + const platform::CPUDeviceContext& ctx, const Tensor& im_info, + const Tensor& gt_classes, const Tensor& is_crowd, const Tensor& gt_segms, + const Tensor& rois, const Tensor& label_int32, const int num_classes, + const int resolution, const framework::LoD& segm_length) { + // Prepare the mask targets by associating one gt mask to each training roi + // that has a fg (non-bg) class label. 
+ const int64_t gt_size = static_cast(gt_classes.dims()[0]); + const int64_t roi_size = static_cast(rois.dims()[0]); + const int* gt_classes_data = gt_classes.data(); + const int* is_crowd_data = is_crowd.data(); + const int* label_int32_data = label_int32.data(); + PADDLE_ENFORCE_EQ(roi_size, label_int32.dims()[0]); + + std::vector mask_gt_inds, fg_inds; + std::vector>> gt_polys; + + auto polys_num = segm_length[1]; + auto segm_lod_offset = framework::ConvertToOffsetBasedLoD(segm_length); + auto lod1 = segm_lod_offset[1]; + auto lod2 = segm_lod_offset[2]; + const T* polys_data = gt_segms.data(); + for (int64_t i = 0; i < gt_size; ++i) { + if ((gt_classes_data[i] > 0) && (is_crowd_data[i] == 0)) { + mask_gt_inds.emplace_back(i); + + // slice fg segmentation polys + int poly_num = polys_num[i]; + std::vector> polys; + int s_idx = lod1[i]; + for (int j = 0; j < poly_num; ++j) { + int s = lod2[s_idx + j]; + int e = lod2[s_idx + j + 1]; + PADDLE_ENFORCE_NE(s, e); + std::vector plts(polys_data + s * 2, polys_data + e * 2); + polys.push_back(plts); + } + gt_polys.push_back(polys); + } + } + for (int64_t i = 0; i < roi_size; ++i) { + if (label_int32_data[i] > 0) { + fg_inds.emplace_back(i); + } + } + int gt_num = mask_gt_inds.size(); + int fg_num = fg_inds.size(); + + Tensor boxes_from_polys; + boxes_from_polys.mutable_data({gt_num, 4}, platform::CPUPlace()); + Poly2Boxes(gt_polys, boxes_from_polys.data()); + + std::vector roi_has_mask = + std::vector(fg_inds.begin(), fg_inds.end()); + Tensor mask_class_labels; + Tensor masks; + Tensor rois_fg; + + auto im_scale = im_info.data()[2]; + if (fg_num > 0) { + // Class labels for the foreground rois + mask_class_labels.mutable_data({fg_num, 1}, ctx.GetPlace()); + Gather(label_int32_data, 1, fg_inds.data(), fg_inds.size(), + mask_class_labels.data()); + + uint8_t* masks_data = masks.mutable_data( + {fg_num, resolution * resolution}, ctx.GetPlace()); + + // Find overlap between all foreground rois and the bounding boxes + // enclosing each segmentation + T* rois_fg_data = rois_fg.mutable_data({fg_num, 4}, ctx.GetPlace()); + Gather(rois.data(), 4, fg_inds.data(), fg_inds.size(), + rois_fg.data()); + + for (int k = 0; k < rois_fg.numel(); ++k) { + rois_fg_data[k] = rois_fg_data[k] / im_scale; + } + + Tensor overlaps_bbfg_bbpolys; + overlaps_bbfg_bbpolys.mutable_data({fg_num, gt_num}, ctx.GetPlace()); + BboxOverlaps(rois_fg, boxes_from_polys, &overlaps_bbfg_bbpolys); + + // Map from each fg rois to the index of the mask with highest overlap + // (measured by bbox overlap) + T* overlaps_bbfg_bbpolys_data = overlaps_bbfg_bbpolys.data(); + std::vector fg_masks_inds; + for (int64_t i = 0; i < fg_num; ++i) { + const T* v = overlaps_bbfg_bbpolys_data + i * gt_num; + T max_overlap = std::numeric_limits::min(); + int id = 0; + for (int64_t j = 0; j < gt_num; ++j) { + if (v[j] > max_overlap) { + max_overlap = v[j]; + id = j; + } + } + fg_masks_inds.push_back(id); + } + + // add fg targets + for (int64_t i = 0; i < fg_num; ++i) { + int fg_polys_ind = fg_masks_inds[i]; + T* roi_fg = rois_fg_data + i * 4; + uint8_t* mask = masks_data + i * resolution * resolution; + Polys2MaskWrtBox(gt_polys[fg_polys_ind], roi_fg, resolution, mask); + } + } else { + // The network cannot handle empty blobs, so we must provide a mask + // We simply take the first bg roi, given it an all -1's mask (ignore + // label), and label it with class zero (bg). 
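The else branch below implements this fallback. In NumPy terms it amounts to the following sketch (names are illustrative; like the C++ code, it copies the coordinates of RoI 0 while recording the index of the first background RoI):

import numpy as np

def bg_fallback(rois, label_int32, im_scale, resolution):
    bg_inds = np.flatnonzero(label_int32 == 0)[:1]  # first background RoI index
    rois_fg = rois[:1] / im_scale                   # box of RoI 0, back to input scale
    masks = np.full((1, resolution * resolution), -1, dtype=np.int32)  # all-ignore mask
    mask_class_labels = np.zeros((1, 1), dtype=np.int32)               # class 0 = bg
    return rois_fg, masks, mask_class_labels, bg_inds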
+ int bg_num = 1; + T* rois_fg_data = rois_fg.mutable_data({bg_num, 4}, ctx.GetPlace()); + const T* rois_data = rois.data(); + std::vector bg_inds; + for (int64_t i = 0; i < roi_size; ++i) { + if (label_int32_data[i] == 0) { + bg_inds.emplace_back(i); + rois_fg_data[0] = rois_data[0] / im_scale; + rois_fg_data[1] = rois_data[1] / im_scale; + rois_fg_data[2] = rois_data[2] / im_scale; + rois_fg_data[3] = rois_data[3] / im_scale; + break; + } + } + masks.mutable_data({bg_num, resolution * resolution}, + ctx.GetPlace()); + math::set_constant(ctx, &masks, -1); + int* mask_class_labels_data = + mask_class_labels.mutable_data({bg_num, 1}, ctx.GetPlace()); + mask_class_labels_data[0] = 0; + roi_has_mask = std::vector(bg_inds.begin(), bg_inds.end()); + } + + Tensor masks_expand; + ExpandMaskTarget(ctx, masks, mask_class_labels, resolution, num_classes, + &masks_expand); + + T* rois_fg_data = rois_fg.data(); + for (int k = 0; k < rois_fg.numel(); ++k) { + rois_fg_data[k] = rois_fg_data[k] * im_scale; + } + + Tensor roi_has_mask_t; + int roi_has_mask_size = roi_has_mask.size(); + int* roi_has_mask_data = + roi_has_mask_t.mutable_data({roi_has_mask_size, 1}, ctx.GetPlace()); + std::copy(roi_has_mask.begin(), roi_has_mask.end(), roi_has_mask_data); + + std::vector res; + res.emplace_back(rois_fg); + res.emplace_back(roi_has_mask_t); + res.emplace_back(masks_expand); + return res; +} + +template +class GenerateMaskLabelsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* im_info = ctx.Input("ImInfo"); + auto* gt_classes = ctx.Input("GtClasses"); + auto* is_crowd = ctx.Input("IsCrowd"); + auto* gt_segms = ctx.Input("GtSegms"); + auto* rois = ctx.Input("Rois"); + auto* label_int32 = ctx.Input("LabelsInt32"); + + auto* mask_rois = ctx.Output("MaskRois"); + auto* roi_has_mask_int32 = ctx.Output("RoiHasMaskInt32"); + auto* mask_int32 = ctx.Output("MaskInt32"); + + int num_classes = ctx.Attr("num_classes"); + int resolution = ctx.Attr("resolution"); + + PADDLE_ENFORCE_EQ(gt_classes->lod().size(), 1UL, + "GenerateMaskLabelsOp gt_classes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "GenerateMaskLabelsOp is_crowd needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(rois->lod().size(), 1UL, + "GenerateMaskLabelsOp rois needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(label_int32->lod().size(), 1UL, + "GenerateMaskLabelsOp label_int32 needs 1 level of LoD"); + + PADDLE_ENFORCE_EQ(gt_segms->lod().size(), 3UL); + + int64_t n = static_cast(gt_classes->lod().back().size() - 1); + PADDLE_ENFORCE_EQ(gt_segms->lod()[0].size() - 1, n); + + int mask_dim = num_classes * resolution * resolution; + + mask_rois->mutable_data({rois->numel(), kBoxDim}, ctx.GetPlace()); + roi_has_mask_int32->mutable_data({rois->numel(), 1}, ctx.GetPlace()); + mask_int32->mutable_data({rois->numel(), mask_dim}, ctx.GetPlace()); + + framework::LoD lod; + std::vector lod0(1, 0); + + int64_t num_mask = 0; + auto& dev_ctx = ctx.device_context(); + + auto gt_classes_lod = gt_classes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); + auto rois_lod = rois->lod().back(); + auto label_int32_lod = label_int32->lod().back(); + auto gt_segms_lod = gt_segms->lod(); + + for (int i = 0; i < n; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor gt_classes_slice = + gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); + Tensor label_int32_slice = + 
label_int32->Slice(label_int32_lod[i], label_int32_lod[i + 1]);
+      Tensor rois_slice = rois->Slice(rois_lod[i], rois_lod[i + 1]);
+
+      auto sub_lod_and_offset =
+          framework::GetSubLoDAndAbsoluteOffset(gt_segms_lod, i, i + 1, 0);
+      auto lod_length = sub_lod_and_offset.first;
+      size_t s = sub_lod_and_offset.second.first;
+      size_t e = sub_lod_and_offset.second.second;
+      Tensor gt_segms_slice = gt_segms->Slice(s, e);
+
+      std::vector<Tensor> tensor_output = SampleMaskForOneImage<T>(
+          dev_ctx, im_info_slice, gt_classes_slice, is_crowd_slice,
+          gt_segms_slice, rois_slice, label_int32_slice, num_classes,
+          resolution, lod_length);
+
+      Tensor sampled_mask_rois = tensor_output[0];
+      Tensor sampled_roi_has_mask_int32 = tensor_output[1];
+      Tensor sampled_mask_int32 = tensor_output[2];
+
+      AppendMask<T>(mask_rois, kBoxDim * num_mask, &sampled_mask_rois);
+      AppendMask<int>(roi_has_mask_int32, num_mask,
+                      &sampled_roi_has_mask_int32);
+      AppendMask<int>(mask_int32, mask_dim * num_mask, &sampled_mask_int32);
+
+      num_mask += sampled_mask_rois.dims()[0];
+      lod0.emplace_back(num_mask);
+    }
+
+    lod.emplace_back(lod0);
+    mask_rois->set_lod(lod);
+    roi_has_mask_int32->set_lod(lod);
+    mask_int32->set_lod(lod);
+    mask_rois->Resize({num_mask, kBoxDim});
+    roi_has_mask_int32->Resize({num_mask, 1});
+    mask_int32->Resize({num_mask, mask_dim});
+  }
+};
+
+class GenerateMaskLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("ImInfo",
+             "(Tensor), This input is a 2D Tensor with shape [B, 3]. "
+             "B is the number of input images, "
+             "each element consists of im_height, im_width, im_scale.");
+    AddInput("GtClasses",
+             "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
+             "M is the number of groundtruth, "
+             "each element is a class label of groundtruth.");
+    AddInput(
+        "IsCrowd",
+        "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
+        "M is the number of groundtruth, "
+        "each element is a flag indicating whether a groundtruth is crowd.");
+    AddInput(
+        "GtSegms",
+        "(LoDTensor), This input is a 2D LoDTensor with shape [S, 2], its LoD "
+        "level is 3. LoD[0] represents the number of gt objects in each "
+        "instance. LoD[1] represents the number of segmentations of each "
+        "object. LoD[2] represents the number of polygons in each "
+        "segmentation. S is the total number of polygon coordinate points. "
+        "Each element is an (x, y) coordinate point.");
+    AddInput(
+        "Rois",
+        "(LoDTensor), This input is a 2D LoDTensor with shape [R, 4]. "
+        "R is the number of rois which is the output of "
+        "generate_proposal_labels, "
+        "each element is a bounding box with (xmin, ymin, xmax, ymax) format.");
+    AddInput("LabelsInt32",
+             "(LoDTensor), This input is a 2D LoDTensor with shape [R, 1], "
+             "each element represents a class label of a RoI");
+    AddOutput(
+        "MaskRois",
+        "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4]. "
+        "P is the number of masks, "
+        "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
+    AddOutput("RoiHasMaskInt32",
+              "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], "
+              "each element represents the output mask RoI's index with "
+              "regard to the input RoIs");
+    AddOutput("MaskInt32",
+              "(LoDTensor), This output is a 2D LoDTensor with shape [P, Q], "
+              "Q equals num_classes * resolution * resolution");
+
+    AddAttr<int>("num_classes", "Class number.");
+    AddAttr<int>("resolution", "Resolution of mask.");
+
+    AddComment(R"DOC(
+Given the RoIs and the corresponding labels, this operator samples
+foreground RoIs. This mask branch also has
+a :math:`K \\times M^{2}` dimensional output target for each foreground
+RoI, which encodes K binary masks of resolution M x M, one for each of the
+K classes. These mask targets are used to compute the loss of the mask branch.
+  )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(generate_mask_labels, ops::GenerateMaskLabelsOp,
+                  ops::GenerateMaskLabelsOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(generate_mask_labels,
+                       ops::GenerateMaskLabelsKernel<float>);
diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
index a652d4d957..5b2e571baf 100644
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -48,20 +48,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
                    "Input(GtBoxes) shouldn't be null.");
     PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Rois"),
-                   "Output(Rois) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Rois"),
+        "Output(Rois) of GenerateProposalLabelsOp should not be null");
     PADDLE_ENFORCE(
         ctx->HasOutput("LabelsInt32"),
-        "Output(LabelsInt32) of RpnTargetAssignOp should not be null");
+        "Output(LabelsInt32) of GenerateProposalLabelsOp should not be null");
     PADDLE_ENFORCE(
         ctx->HasOutput("BboxTargets"),
-        "Output(BboxTargets) of RpnTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("BboxInsideWeights"),
-        "Output(BboxInsideWeights) of RpnTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("BboxOutsideWeights"),
-        "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null");
+        "Output(BboxTargets) of GenerateProposalLabelsOp should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("BboxInsideWeights"),
+                   "Output(BboxInsideWeights) of GenerateProposalLabelsOp "
+                   "should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("BboxOutsideWeights"),
+                   "Output(BboxOutsideWeights) of GenerateProposalLabelsOp "
+                   "should not be null");
     auto rpn_rois_dims = ctx->GetInputDim("RpnRois");
     auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
@@ -225,30 +226,36 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
 template <typename T>
 std::vector<Tensor> SampleRoisForOneImage(
-    const platform::CPUDeviceContext& context, Tensor* rpn_rois,
-    Tensor* gt_classes, Tensor* is_crowd, Tensor* gt_boxes, Tensor* im_info,
-    const int batch_size_per_im, const float fg_fraction, const float fg_thresh,
-    const float bg_thresh_hi, const float bg_thresh_lo,
+    const platform::CPUDeviceContext& context, const Tensor& rpn_rois_in,
+    const Tensor& gt_classes, const Tensor& is_crowd, const Tensor& gt_boxes,
+    const Tensor& im_info, const int batch_size_per_im, const float fg_fraction,
+    const float fg_thresh, const float bg_thresh_hi, const float bg_thresh_lo,
     const std::vector<float>& bbox_reg_weights, const int class_nums,
     std::minstd_rand engine, bool use_random) {
-  auto rpn_rois_et = framework::EigenTensor<T, 2>::From(*rpn_rois);
-  auto im_scale = im_info->data<T>()[2];
-  rpn_rois_et = rpn_rois_et / im_scale;
+  auto im_scale = im_info.data<T>()[2];
+
+  Tensor rpn_rois;
+  rpn_rois.mutable_data<T>(rpn_rois_in.dims(), context.GetPlace());
+  T* rpn_rois_dt = rpn_rois.data<T>();
+  const T* rpn_rois_in_dt = rpn_rois_in.data<T>();
+  for (int i = 0; i < rpn_rois.numel(); ++i) {
+    rpn_rois_dt[i] =
rpn_rois_in_dt[i] / im_scale; + } Tensor boxes; - int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0]; + int proposals_num = gt_boxes.dims()[0] + rpn_rois.dims()[0]; boxes.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); - Concat(context, *gt_boxes, *rpn_rois, &boxes); + Concat(context, gt_boxes, rpn_rois, &boxes); // Overlaps Tensor proposal_to_gt_overlaps; - proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes->dims()[0]}, + proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes.dims()[0]}, context.GetPlace()); - BboxOverlaps(boxes, *gt_boxes, &proposal_to_gt_overlaps); + BboxOverlaps(boxes, gt_boxes, &proposal_to_gt_overlaps); // Generate proposal index std::vector> fg_bg_gt = SampleFgBgGt( - context, &proposal_to_gt_overlaps, *is_crowd, batch_size_per_im, + context, &proposal_to_gt_overlaps, is_crowd, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random); std::vector fg_inds = fg_bg_gt[0]; std::vector bg_inds = fg_bg_gt[1]; @@ -263,7 +270,7 @@ std::vector SampleRoisForOneImage( sampled_boxes.mutable_data(bbox_dim, context.GetPlace()); sampled_labels.mutable_data({boxes_num}, context.GetPlace()); sampled_gts.mutable_data({fg_num, kBoxDim}, context.GetPlace()); - GatherBoxesLabels(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds, + GatherBoxesLabels(context, boxes, gt_boxes, gt_classes, fg_inds, bg_inds, gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts); // Compute targets @@ -397,8 +404,8 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); Tensor im_info_slice = im_info->Slice(i, i + 1); std::vector tensor_output = SampleRoisForOneImage( - dev_ctx, &rpn_rois_slice, >_classes_slice, &is_crowd_slice, - >_boxes_slice, &im_info_slice, batch_size_per_im, fg_fraction, + dev_ctx, rpn_rois_slice, gt_classes_slice, is_crowd_slice, + gt_boxes_slice, im_info_slice, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, engine, use_random); Tensor sampled_rois = tensor_output[0]; @@ -467,7 +474,7 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { "P usuall equal to batch_size_per_im * batch_size, " "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); AddOutput("LabelsInt32", - "(LoDTensor), This output is a 2D LoDTensor with shape [P], " + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], " "each element repersents a class label of a roi"); AddOutput("BboxTargets", "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * " diff --git a/paddle/fluid/operators/detection/mask_util.cc b/paddle/fluid/operators/detection/mask_util.cc new file mode 100644 index 0000000000..bd6fee7138 --- /dev/null +++ b/paddle/fluid/operators/detection/mask_util.cc @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detection/mask_util.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/memory/memory.h" + +namespace paddle { +namespace operators { + +uint32_t UMax(uint32_t a, uint32_t b) { return (a > b) ? a : b; } + +static inline int Compare(const void* a, const void* b) { + uint32_t c = *(reinterpret_cast(a)); + uint32_t d = *(reinterpret_cast(b)); + return c > d ? 1 : c < d ? -1 : 0; +} + +void Decode(const uint32_t* cnts, int m, uint8_t* mask) { + uint8_t v = 0; + for (int j = 0; j < m; j++) { + for (uint32_t k = 0; k < cnts[j]; k++) { + *(mask++) = v; + } + v = !v; + } +} + +typedef uint32_t uint; +void Poly2Mask(const float* xy, int k, int h, int w, uint8_t* mask) { + int j, m = 0; + double scale = 5; + int *x, *y, *u, *v; + uint *a, *b; + platform::CPUPlace cpu; + auto xptr = memory::Alloc(cpu, sizeof(int) * (k + 1) * 2); + x = reinterpret_cast(xptr->ptr()); + y = x + (k + 1); + + for (j = 0; j < k; j++) x[j] = static_cast(scale * xy[j * 2 + 0] + .5); + x[k] = x[0]; + for (j = 0; j < k; j++) y[j] = static_cast(scale * xy[j * 2 + 1] + .5); + y[k] = y[0]; + for (j = 0; j < k; j++) { + m += UMax(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1])) + 1; + } + auto vptr = memory::Alloc(cpu, sizeof(int) * m * 2); + u = reinterpret_cast(vptr->ptr()); + v = u + m; + m = 0; + for (j = 0; j < k; j++) { + int xs = x[j], xe = x[j + 1], ys = y[j], ye = y[j + 1], dx, dy, t, d; + int flip; + double s; + dx = abs(xe - xs); + dy = abs(ys - ye); + flip = (dx >= dy && xs > xe) || (dx < dy && ys > ye); + if (flip) { + t = xs; + xs = xe; + xe = t; + t = ys; + ys = ye; + ye = t; + } + if (dx >= dy) { + s = dx == 0 ? 0 : static_cast(ye - ys) / dx; + for (d = 0; d <= dx; d++) { + t = flip ? dx - d : d; + u[m] = t + xs; + v[m] = static_cast(ys + s * t + .5); + m++; + } + } else { + s = dy == 0 ? 0 : static_cast(xe - xs) / dy; + for (d = 0; d <= dy; d++) { + t = flip ? dy - d : d; + v[m] = t + ys; + u[m] = static_cast(xs + s * t + .5); + m++; + } + } + } + /* get points along y-boundary and downsample */ + k = m; + m = 0; + double xd, yd; + auto xyptr = memory::Alloc(cpu, sizeof(int) * k * 2); + x = reinterpret_cast(xyptr->ptr()); + y = x + k; + for (j = 1; j < k; j++) { + if (u[j] != u[j - 1]) { + xd = static_cast(u[j] < u[j - 1] ? u[j] : u[j] - 1); + xd = (xd + .5) / scale - .5; + if (floor(xd) != xd || xd < 0 || xd > w - 1) continue; + yd = static_cast(v[j] < v[j - 1] ? 
v[j] : v[j - 1]); + yd = (yd + .5) / scale - .5; + if (yd < 0) + yd = 0; + else if (yd > h) + yd = h; + yd = ceil(yd); + x[m] = static_cast(xd); + y[m] = static_cast(yd); + m++; + } + } + /* compute rle encoding given y-boundary points */ + k = m; + auto aptr = memory::Alloc(cpu, sizeof(uint) * (k + 1)); + a = reinterpret_cast(aptr->ptr()); + for (j = 0; j < k; j++) a[j] = static_cast(x[j] * h + y[j]); + a[k++] = static_cast(h * w); + + qsort(a, k, sizeof(uint), Compare); + uint p = 0; + for (j = 0; j < k; j++) { + uint t = a[j]; + a[j] -= p; + p = t; + } + auto bptr = memory::Alloc(cpu, sizeof(uint32_t) * k); + b = reinterpret_cast(bptr->ptr()); + j = m = 0; + b[m++] = a[j++]; + while (j < k) { + if (a[j] > 0) { + b[m++] = a[j++]; + } else { + j++; + if (j < k) b[m - 1] += a[j++]; + } + } + + // convert to mask + auto mskptr = memory::Alloc(cpu, sizeof(uint8_t) * h * w); + uint8_t* msk = reinterpret_cast(mskptr->ptr()); + Decode(b, m, msk); + + for (int ii = 0; ii < h; ++ii) { + for (int jj = 0; jj < w; ++jj) { + mask[ii * w + jj] = msk[jj * h + ii]; + } + } +} + +void Poly2Boxes(const std::vector>>& polys, + float* boxes) { + // lists + for (size_t i = 0; i < polys.size(); ++i) { + float x0 = std::numeric_limits::max(); + float x1 = std::numeric_limits::min(); + float y0 = std::numeric_limits::max(); + float y1 = std::numeric_limits::min(); + // each list may have more than one polys + for (size_t j = 0; j < polys[i].size(); ++j) { + for (size_t k = 0; k < polys[i][j].size() / 2; ++k) { + x0 = std::min(x0, polys[i][j][2 * k]); + x1 = std::max(x1, polys[i][j][2 * k]); + y0 = std::min(y0, polys[i][j][2 * k + 1]); + y1 = std::max(y1, polys[i][j][2 * k + 1]); + } + } + boxes[i * 4] = x0; + boxes[i * 4 + 1] = y0; + boxes[i * 4 + 2] = x1; + boxes[i * 4 + 3] = y1; + } +} + +void Polys2MaskWrtBox(const std::vector>& polygons, + const float* box, int M, uint8_t* mask) { + float w = box[2] - box[0]; + float h = box[3] - box[1]; + w = std::max(w, static_cast(1.)); + h = std::max(h, static_cast(1.)); + + uint8_t* msk = nullptr; + if (polygons.size() == 1UL) { + msk = mask; + } else { + msk = reinterpret_cast( + malloc(M * M * polygons.size() * sizeof(uint8_t))); + } + for (size_t i = 0; i < polygons.size(); ++i) { + int k = polygons[i].size() / 2; + std::vector p; + for (int j = 0; j < k; ++j) { + float pw = (polygons[i][2 * j] - box[0]) * M / w; + float ph = (polygons[i][2 * j + 1] - box[1]) * M / h; + p.push_back(pw); + p.push_back(ph); + } + uint8_t* msk_i = msk + i * M * M; + Poly2Mask(p.data(), k, M, M, msk_i); + } + + if (polygons.size() > 1UL) { + for (size_t i = 0; i < polygons.size(); ++i) { + uint8_t* msk_i = msk + i * M * M; + for (int j = 0; j < M * M; ++j) { + if (i == 0) { + mask[j] = msk_i[j]; + } else { + mask[j] = (mask[j] + msk_i[j]) > 0 ? 1 : 0; + } + } + } + free(msk); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/mask_util.h b/paddle/fluid/operators/detection/mask_util.h new file mode 100644 index 0000000000..4e0ea54f6d --- /dev/null +++ b/paddle/fluid/operators/detection/mask_util.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
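The Polys2MaskWrtBox routine above follows Detectron's polys_to_mask_wrt_box. If pycocotools is available, the same operation can be sketched in Python as follows (an illustration only, not part of the patch):

import numpy as np
import pycocotools.mask as coco_mask

def polys_to_mask_wrt_box(polygons, box, M):
    # Rasterize `polygons` (flat [x0, y0, x1, y1, ...] lists) into an
    # M x M binary mask expressed in the coordinate frame of `box`.
    w = max(box[2] - box[0], 1.0)
    h = max(box[3] - box[1], 1.0)
    scaled = []
    for poly in polygons:
        p = np.asarray(poly, dtype=np.float64).reshape(-1, 2)
        p[:, 0] = (p[:, 0] - box[0]) * M / w
        p[:, 1] = (p[:, 1] - box[1]) * M / h
        scaled.append(p.reshape(-1).tolist())
    rles = coco_mask.frPyObjects(scaled, M, M)
    masks = coco_mask.decode(rles)           # (M, M, num_polys), uint8
    return (masks.sum(axis=2) > 0).astype(np.uint8)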
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <stdint.h>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+void Poly2Mask(const float* poly, int k, int h, int w, uint8_t* mask);
+
+void Poly2Boxes(const std::vector<std::vector<std::vector<float>>>& polys,
+                float* boxes);
+
+void Polys2MaskWrtBox(const std::vector<std::vector<float>>& polygons,
+                      const float* box, int M, uint8_t* mask);
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/mask_util_test.cc b/paddle/fluid/operators/detection/mask_util_test.cc
new file mode 100644
index 0000000000..de904e9474
--- /dev/null
+++ b/paddle/fluid/operators/detection/mask_util_test.cc
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/mask_util.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/memory/memory.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+void Compare(const T* a, const T* b, const int n) {
+  for (int i = 0; i < n; i++) {
+    EXPECT_EQ(a[i], b[i]);
+  }
+}
+
+TEST(MaskUtil, Poly2MaskTest) {
+  float polys[] = {1.97f, 1.88f, 5.81f, 1.88f, 1.69f,
+                   6.53f, 5.94f, 6.38f, 1.97f, 1.88f};
+  int h = 8, w = 8;
+  int k = 5;  // length(polys) / 2
+  // clang-format off
+  uint8_t expect_mask[] = {
+      0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 1, 1, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 1, 0, 0, 0, 0,
+      0, 0, 1, 1, 1, 0, 0, 0,
+      0, 0, 1, 1, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0
+  };
+  // clang-format on
+
+  // the ground-truth mask is computed by the COCO API:
+  //
+  // import pycocotools.mask as mask_util
+  // import numpy as np
+  // segm = [1.97, 1.88, 5.81, 1.88, 1.69, 6.53, 5.94, 6.38, 1.97, 1.88]
+  // rles = mask_util.frPyObjects([segm], im_h, im_w)
+  // mask = mask_util.decode(rles)
+  // print mask
+  platform::CPUPlace cpu;
+  auto allocation = memory::Alloc(cpu, sizeof(expect_mask));
+  uint8_t* mask = reinterpret_cast<uint8_t*>(allocation->ptr());
+  Poly2Mask(polys, k, h, w, mask);
+  Compare<uint8_t>(expect_mask, mask, h * w);
+}
+
+TEST(MaskUtil, Poly2BoxesTest) {
+  // clang-format off
+  std::vector<std::vector<std::vector<float>>> polys = {
+      {{1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}},
+      {{2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}}
+  };
+  float expect_boxes[] = {
+      1.69f, 1.88f, 5.94f, 6.53f,
+      1.69f, 0.88f, 6.94f, 6.63f
+  };
+  // clang-format on
+
+  platform::CPUPlace cpu;
+  auto allocation = memory::Alloc(cpu, sizeof(expect_boxes));
+  float* boxes = reinterpret_cast<float*>(allocation->ptr());
+  Poly2Boxes(polys, boxes);
+  Compare<float>(expect_boxes, boxes, 8);
+}
+
+TEST(MaskUtil, Polys2MaskWrtBoxTest) {
+  // clang-format off
+  std::vector<std::vector<std::vector<float>>> polys = {{
{1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}, + {2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}}}; + float expect_boxes[] = { + 1.69f, 0.88f, 6.94f, 6.63f + }; + uint8_t expect_mask[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 1 + }; + // clang-format on + + platform::CPUPlace cpu; + auto allocation = memory::Alloc(cpu, sizeof(expect_boxes)); + float* boxes = reinterpret_cast(allocation->ptr()); + Poly2Boxes(polys, boxes); + Compare(expect_boxes, boxes, 4); + + auto allocat_mask = memory::Alloc(cpu, sizeof(expect_mask)); + uint8_t* mask = reinterpret_cast(allocat_mask->ptr()); + int M = 8; + Polys2MaskWrtBox(polys[0], expect_boxes, M, mask); + Compare(expect_mask, mask, M * M); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 0a8c0814a7..55cef93aac 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -103,8 +103,10 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, REGISTER_OPERATOR(gather_grad, ops::GatherGradOp); REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, + ops::GatherOpKernel, ops::GatherOpKernel); REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel, ops::GatherGradientOpKernel); diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index bcec6f3563..8d695fdedd 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/roi_align_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -255,8 +256,8 @@ class GPUROIAlignOpKernel : public framework::OpKernel { Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; PADDLE_ENFORCE_EQ( @@ -270,14 +271,18 @@ class GPUROIAlignOpKernel : public framework::OpKernel { roi_batch_id_data[i] = n; } } - Tensor roi_batch_id_list_gpu; - framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(), - &roi_batch_id_list_gpu); - GPUROIAlignForward< - T><<>>( + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); + GPUROIAlignForward<<>>( output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, sampling_ratio, - roi_batch_id_list_gpu.data(), + height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data, out->mutable_data(ctx.GetPlace())); } }; @@ -307,8 +312,8 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { } Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; for (int n = 0; n < rois_batch_size; ++n) { @@ -316,24 +321,28 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { roi_batch_id_data[i] = n; } } - Tensor roi_batch_id_list_gpu; - framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(), - &roi_batch_id_list_gpu); - + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto roi_ptr = allocator.Allocate(roi_batch_id_list.numel() * sizeof(int)); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + int bytes = roi_batch_id_list.numel() * sizeof(int); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); in_grad->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), in_grad, static_cast(0)); + set_zero(dev_ctx, in_grad, static_cast(0)); int output_grad_size = out_grad->numel(); int blocks = NumBlocks(output_grad_size); int threads = kNumCUDAThreads; if (output_grad_size > 0) { - GPUROIAlignBackward< - T><<>>( + GPUROIAlignBackward<<>>( output_grad_size, rois->data(), out_grad->data(), rois_num, spatial_scale, channels, height, width, pooled_height, pooled_width, - sampling_ratio, roi_batch_id_list_gpu.data(), + sampling_ratio, roi_id_data, in_grad->mutable_data(ctx.GetPlace())); } } diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 75c3dd6bc4..ac3a4201e6 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ 
b/paddle/fluid/operators/roi_pool_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/roi_pool_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -152,8 +153,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel { framework::Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; PADDLE_ENFORCE_EQ( @@ -168,15 +169,20 @@ class GPUROIPoolOpKernel : public framework::OpKernel { } } - framework::Tensor roi_batch_id_list_gpu; - framework::TensorCopy(roi_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &roi_batch_id_list_gpu); - - GPUROIPoolForward< - T><<>>( + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); + + GPUROIPoolForward<<>>( output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, - roi_batch_id_list_gpu.data(), out->mutable_data(ctx.GetPlace()), + height, width, pooled_height, pooled_width, roi_id_data, + out->mutable_data(ctx.GetPlace()), argmax->mutable_data(ctx.GetPlace())); } }; @@ -204,8 +210,8 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { if (x_grad) { framework::Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; for (int n = 0; n < rois_batch_size; ++n) { @@ -213,25 +219,30 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { roi_batch_id_data[i] = n; } } - framework::Tensor roi_batch_id_list_gpu; - framework::TensorCopy(roi_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &roi_batch_id_list_gpu); + + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); x_grad->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), x_grad, static_cast(0)); + set_zero(dev_ctx, x_grad, static_cast(0)); int output_grad_size = out_grad->numel(); int blocks = NumBlocks(output_grad_size); int threads = kNumCUDAThreads; if (output_grad_size > 0) { - GPUROIPoolBackward< - T><<>>( + GPUROIPoolBackward<<>>( output_grad_size, rois->data(), out_grad->data(), argmax->data(), rois_num, spatial_scale, channels, height, - width, pooled_height, pooled_width, - 
roi_batch_id_list_gpu.data(), + width, pooled_height, pooled_width, roi_id_data, x_grad->mutable_data(ctx.GetPlace())); } } diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 14746fa951..c21b0c13c7 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -101,6 +101,10 @@ class SigmoidCrossEntropyWithLogitsOpMaker AddOutput("Out", "(Tensor, default Tensor), a 2-D tensor with shape N x D " " of elementwise logistic losses."); + AddAttr("normalize", + "if true, divide the loss by the number of " + "targets != ignore_index.") + .SetDefault(false); AddAttr("ignore_index", "(int, default kIgnoreIndex), Specifies a target value that " "is ignored and" @@ -145,9 +149,14 @@ REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp); -REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CPUDeviceContext, float>); +REGISTER_OP_CPU_KERNEL( + sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsKernel, + ops::SigmoidCrossEntropyWithLogitsKernel); REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUDeviceContext, float>); + paddle::platform::CPUDeviceContext, float>, + ops::SigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu index a1fbc7e5fa..2a4570ef5c 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -11,12 +11,184 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
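The new normalize attribute divides the elementwise loss by the number of targets whose label differs from ignore_index. A NumPy sketch of the forward semantics (the kernels clamp the divisor with a small eps; max(count, 1) below has the same effect):

import numpy as np

def sigmoid_cross_entropy_with_logits_np(x, label, ignore_index=-100,
                                         normalize=False):
    # Out = max(x, 0) - x * label + log(1 + exp(-|x|)); ignored targets give 0.
    out = np.maximum(x, 0) - x * label + np.log1p(np.exp(-np.abs(x)))
    keep = label != ignore_index
    out = np.where(keep, out, 0.0)
    if normalize:
        out = out / max(int(keep.sum()), 1)  # count of non-ignored targets
    return out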
*/ +#include "cub/cub.cuh" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static HOSTDEVICE float real_exp(float x) { return expf(x); } +static HOSTDEVICE float real_exp(double x) { return exp(x); } +static HOSTDEVICE float real_log(float x) { return logf(x); } +static HOSTDEVICE float real_log(double x) { return log(x); } + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void GPUSigmoidForward(const T *x_data, const T *label_data, + const int ignore_index, const int limit, + T *out_data, T *counts) { + CUDA_1D_KERNEL_LOOP(i, limit) { + T x = x_data[i]; + T label = label_data[i]; + T eps = static_cast(1e-5); + T diff = label - static_cast(ignore_index); + if ((diff > -eps) && (diff < eps)) { + out_data[i] = static_cast(0.); + counts[i] = 0; + } else { + T term1 = (x > 0) ? x : 0; + T term2 = x * label; + T term3 = real_log(static_cast(1) + real_exp(static_cast(-abs(x)))); + out_data[i] = term1 - term2 + term3; + counts[i] = 1; + } + } +} + +template +__global__ void Sum(const T *counts, int num, const T eps, T *sum) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T in = 0; + for (int i = threadIdx.x; i < num; i += BlockDim) { + in += counts[i]; + } + __syncthreads(); + auto out = + BlockReduce(temp_storage).Reduce(static_cast(in), cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + T a = out > eps ? 
out : eps; + sum[0] = a; + } +} + +template +__global__ void Div(T *loss, const int num, const T *norm) { + CUDA_1D_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; } +} + +template +__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data, + const int ignore_index, const T *dout_data, + const int limit, T *dx_data, T *counts) { + CUDA_1D_KERNEL_LOOP(i, limit) { + T x = x_data[i]; + T label = label_data[i]; + T dout = dout_data[i]; + T eps = static_cast(1e-5); + T diff = label - static_cast(ignore_index); + if ((diff > -eps) && (diff < eps)) { + dx_data[i] = static_cast(0.); + counts[i] = 0; + } else { + T simoid_x = static_cast(1) / (static_cast(1) + real_exp(-x)); + T diff = simoid_x - label; + dx_data[i] = dout * diff; + counts[i] = 1; + } + } +} + +// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) +template +class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + Tensor *Out = context.Output("Out"); + int ignore_index = context.Attr("ignore_index"); + auto out_data = Out->mutable_data(context.GetPlace()); + + auto &dev_ctx = context.cuda_device_context(); + bool normalize = context.Attr("normalize"); + + // Temporary memory + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto cnt_ptr = allocator.Allocate(Labels->numel() * sizeof(T)); + T *counts = reinterpret_cast(cnt_ptr->ptr()); + + int limit = Out->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + GPUSigmoidForward<<>>( + X->data(), Labels->data(), ignore_index, limit, out_data, counts); + if (normalize) { + auto norm_ptr = allocator.Allocate(sizeof(T)); + T *norm = reinterpret_cast(norm_ptr->ptr()); + Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( + counts, limit, static_cast(1e-5), norm); + Div<<>>(out_data, limit, norm); + } + } +}; + +// dX = sigmoid(X) - labels +template +class GPUSigmoidCrossEntropyWithLogitsGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + const Tensor *dOut = context.Input(framework::GradVarName("Out")); + Tensor *dX = context.Output(framework::GradVarName("X")); + auto dx_data = dX->mutable_data(context.GetPlace()); + + int ignore_index = context.Attr("ignore_index"); + + auto &dev_ctx = context.cuda_device_context(); + // Temporary memory + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto cnt_ptr = allocator.Allocate(X->numel() * sizeof(T)); + T *counts = reinterpret_cast(cnt_ptr->ptr()); + + int limit = dX->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + GPUSigmoidBackward<<>>( + X->data(), Labels->data(), ignore_index, dOut->data(), limit, + dx_data, counts); + bool normalize = context.Attr("normalize"); + if (normalize) { + auto norm_ptr = allocator.Allocate(sizeof(T)); + T *norm = reinterpret_cast(norm_ptr->ptr()); + Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( + counts, limit, static_cast(1e-5), norm); + Div<<>>(dx_data, limit, norm); + } + } +}; + +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CUDADeviceContext, float>); + 
ops::GPUSigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CUDADeviceContext, float>, + ops::GPUSigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CUDADeviceContext, double>); REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CUDADeviceContext, float>); + ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CUDADeviceContext, float>, + ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CUDADeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h index 6e75f9e0b8..8f459d573a 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h @@ -13,54 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/framework/eigen.h" +#include +#include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/hostdevice.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; -template -using EigenMatrix = framework::EigenMatrix; - -template -struct SigmoidCrossEntropyWithLogitsForward { - HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index) - : ignore_index(ignore_index) {} - - HOSTDEVICE T operator()(const T &x, const T &label) const { - if (static_cast(label) == ignore_index) { - return static_cast(0.); - } - T term1 = (x > 0) ? x : 0; - T term2 = x * label; - T term3 = std::log(static_cast(1) + std::exp(-(std::abs(x)))); - return term1 - term2 + term3; - } - - int ignore_index; -}; - -template -struct SigmoidCrossEntropyWithLogitsBackward { - HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index) - : ignore_index(ignore_index) {} - - HOSTDEVICE T operator()(const T &x, const T &label) const { - if (static_cast(label) == ignore_index) { - return static_cast(0.); - } - T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); - return simoid_x - label; - } - - int ignore_index; -}; // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) template @@ -70,16 +30,37 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { const Tensor *X = context.Input("X"); const Tensor *Labels = context.Input("Label"); Tensor *Out = context.Output("Out"); - Out->mutable_data(context.GetPlace()); int ignore_index = context.Attr("ignore_index"); - - auto x = EigenVector::Flatten(*X); - auto labels = EigenVector::Flatten(*Labels); - auto out = EigenVector::Flatten(*Out); - auto &place = *context.device_context().eigen_device(); - - out.device(place) = x.binaryExpr( - labels, SigmoidCrossEntropyWithLogitsForward(ignore_index)); + auto out_data = Out->mutable_data(context.GetPlace()); + int limit = Out->numel(); + auto x_data = X->data(); + auto label_data = Labels->data(); + for (int idx = 0; idx < limit; ++idx) { + T x = x_data[idx]; + T label = label_data[idx]; + if (static_cast(label) == ignore_index) { + out_data[idx] = static_cast(0.); + } else { + T term1 = (x > 0) ? 
x : 0; + T term2 = x * label; + T term3 = std::log(static_cast(1) + std::exp(-std::abs(x))); + out_data[idx] = term1 - term2 + term3; + } + } + bool normalize = context.Attr("normalize"); + if (normalize) { + int norm = 0; + T eps = static_cast(1e-6); + for (int idx = 0; idx < limit; ++idx) { + T diff = label_data[idx] - static_cast(ignore_index); + if ((diff < -eps) || (diff > eps)) { + norm += 1; + } + } + eps = static_cast(1e-5); + norm = norm > eps ? norm : eps; + std::for_each(out_data, out_data + limit, [norm](T &v) { v = v / norm; }); + } } }; @@ -92,19 +73,39 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { const Tensor *Labels = context.Input("Label"); const Tensor *dOut = context.Input(framework::GradVarName("Out")); Tensor *dX = context.Output(framework::GradVarName("X")); - dX->mutable_data(context.GetPlace()); - - auto ignore_index = context.Attr("ignore_index"); - auto x = EigenVector::Flatten(*X); - auto labels = EigenVector::Flatten(*Labels); - auto dout = EigenVector::Flatten(*dOut); - auto dx = EigenVector::Flatten(*dX); - auto &place = - *context.template device_context().eigen_device(); + auto dx_data = dX->mutable_data(context.GetPlace()); - auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward( - static_cast(ignore_index))); - dx.device(place) = dout * diff; + int ignore_index = context.Attr("ignore_index"); + int limit = dX->numel(); + auto x_data = X->data(); + auto label_data = Labels->data(); + auto dout_data = dOut->data(); + for (int idx = 0; idx < limit; ++idx) { + T x = x_data[idx]; + T label = label_data[idx]; + T dout = dout_data[idx]; + if (static_cast(label) == ignore_index) { + dx_data[idx] = static_cast(0.); + } else { + T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); + T diff = simoid_x - label; + dx_data[idx] = dout * diff; + } + } + bool normalize = context.Attr("normalize"); + if (normalize) { + int norm = 0; + T eps = static_cast(1e-6); + for (int idx = 0; idx < limit; ++idx) { + T diff = label_data[idx] - static_cast(ignore_index); + if ((diff < -eps) || (diff > eps)) { + norm += 1; + } + } + eps = static_cast(1e-5); + norm = norm > eps ? norm : eps; + std::for_each(dx_data, dx_data + limit, [norm](T &v) { v = v / norm; }); + } } }; diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 7b70d19de5..a24e1d1300 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -88,8 +88,8 @@ class DataToLoDTensorConverter(object): raise ValueError( "Reshape error. 
What is defined in data layer is {}, but receive {}"
                    .format(self.shape, arr.shape))
-            else:
-                self._check_shape(arr.shape)
+            #else:
+            #    self._check_shape(arr.shape)
         t = core.LoDTensor()
         t.set(arr, self.place)
         if self.lod_level > 0:
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 8aed97dc59..cddc302d52 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -44,6 +44,7 @@ __all__ = [
     'roi_perspective_transform',
     'generate_proposal_labels',
     'generate_proposals',
+    'generate_mask_labels',
     'iou_similarity',
     'box_coder',
     'polygon_box_transform',
@@ -1659,7 +1660,7 @@ def generate_proposal_labels(rpn_rois,
                              class_nums=None,
                              use_random=True):
     """
-    ** Generate proposal labels Faster-RCNN **
+    ** Generate Proposal Labels of Faster-RCNN **
     This operator can be, for given the GenerateProposalOp output bounding
     boxes and groundtruth, to sample foreground boxes and background boxes,
     and compute loss target.
@@ -1740,6 +1741,140 @@ def generate_proposal_labels(rpn_rois,
     return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights
+def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois,
+                         labels_int32, num_classes, resolution):
+    """
+    ** Generate Mask Labels for Mask-RCNN **
+
+    Given the RoIs and the corresponding labels, this operator samples
+    foreground RoIs. This mask branch also has
+    a :math:`K \\times M^{2}` dimensional output target for each foreground
+    RoI, which encodes K binary masks of resolution M x M, one for each of the
+    K classes. These mask targets are used to compute the loss of the mask
+    branch.
+
+    Please note the data format of the ground-truth segmentation; assume the
+    segmentations are as follows. The first instance has two gt objects.
+    The second instance has one gt object, and this object has two gt
+    segmentations.
+
+    .. code-block:: python
+
+        #[
+        #  [[[229.14, 370.9, 229.14, 370.9, ...]],
+        #   [[343.7, 139.85, 349.01, 138.46, ...]]], # 0-th instance
+        #  [[[500.0, 390.62, ...],[115.48, 187.86, ...]]] # 1-th instance
+        #]
+
+        batch_masks = []
+        for segms in batch_segms:
+            gt_masks = []
+            for segm in segms:
+                gt_segm = []
+                for polys in segm:
+                    gt_segm.append(np.array(polys).reshape(-1, 2))
+                gt_masks.append(gt_segm)
+            batch_masks.append(gt_masks)
+
+
+        place = fluid.CPUPlace()
+        feeder = fluid.DataFeeder(place=place, feed_list=feeds)
+        feeder.feed(batch_masks)
+
+    Args:
+        im_info(Variable): A 2-D Tensor with shape [N, 3]. N is the batch size,
+            each element is [height, width, scale] of image. Image scale is
+            target_size / original_size.
+        gt_classes(Variable): A 2-D LoDTensor with shape [M, 1]. M is the total
+            number of ground-truth objects, each element is a class label.
+        is_crowd(Variable): A 2-D LoDTensor with the same shape as gt_classes,
+            each element is a flag indicating whether a groundtruth is crowd.
+        gt_segms(Variable): This input is a 2D LoDTensor with shape [S, 2],
+            its LoD level is 3. Usually users do not need to understand LoD;
+            the reader should return the correct data format.
+
+            LoD[0] represents the number of gt objects in
+            each instance. LoD[1] represents the number of segmentations of
+            each object. LoD[2] represents the number of polygons in each
+            segmentation. S is the total number of polygon coordinate points.
+            Each element is an (x, y) coordinate point.
+        rois(Variable): A 2-D LoDTensor with shape [R, 4].
+            R is the total number of RoIs; each element is a bounding box
+            with (xmin, ymin, xmax, ymax) format in the range of the
+            original image.
+        labels_int32(Variable): A 2-D LoDTensor in shape of [R, 1] with type
+            of int32. R is the same as in `rois`. Each element represents
+            a class label of a RoI.
+        num_classes(int): Class number.
+        resolution(int): Resolution of mask predictions.
+
+    Returns:
+        mask_rois (Variable): A 2D LoDTensor with shape [P, 4]. P is the total
+            number of sampled RoIs. Each element is a bounding box with
+            [xmin, ymin, xmax, ymax] format in the range of the original
+            image size.
+        roi_has_mask_int32 (Variable): A 2D LoDTensor with shape [P, 1];
+            each element represents the output mask RoI index with regard
+            to the input RoIs.
+        mask_int32 (Variable): A 2D LoDTensor with shape [P, K * M * M],
+            where K is the number of classes and M is the resolution of the
+            mask predictions. Each element represents the binary mask targets.
+
+    Examples:
+        .. code-block:: python
+
+            im_info = fluid.layers.data(name="im_info", shape=[3],
+                dtype="float32")
+            gt_classes = fluid.layers.data(name="gt_classes", shape=[1],
+                dtype="int32", lod_level=1)
+            is_crowd = fluid.layers.data(name="is_crowd", shape=[1],
+                dtype="int32", lod_level=1)
+            gt_masks = fluid.layers.data(name="gt_masks", shape=[2],
+                dtype="float32", lod_level=3)
+            # rois, labels_int32 can be the output of
+            # fluid.layers.generate_proposal_labels.
+            mask_rois, mask_index, mask_int32 = fluid.layers.generate_mask_labels(
+                im_info=im_info,
+                gt_classes=gt_classes,
+                is_crowd=is_crowd,
+                gt_segms=gt_masks,
+                rois=rois,
+                labels_int32=labels_int32,
+                num_classes=81,
+                resolution=14)
+    """
+
+    helper = LayerHelper('generate_mask_labels', **locals())
+
+    mask_rois = helper.create_variable_for_type_inference(dtype=rois.dtype)
+    roi_has_mask_int32 = helper.create_variable_for_type_inference(
+        dtype=gt_classes.dtype)
+    mask_int32 = helper.create_variable_for_type_inference(
+        dtype=gt_classes.dtype)
+
+    helper.append_op(
+        type="generate_mask_labels",
+        inputs={
+            'ImInfo': im_info,
+            'GtClasses': gt_classes,
+            'IsCrowd': is_crowd,
+            'GtSegms': gt_segms,
+            'Rois': rois,
+            'LabelsInt32': labels_int32
+        },
+        outputs={
+            'MaskRois': mask_rois,
+            'RoiHasMaskInt32': roi_has_mask_int32,
+            'MaskInt32': mask_int32
+        },
+        attrs={'num_classes': num_classes,
+               'resolution': resolution})
+
+    mask_rois.stop_gradient = True
+    roi_has_mask_int32.stop_gradient = True
+    mask_int32.stop_gradient = True
+
+    return mask_rois, roi_has_mask_int32, mask_int32
+
+
 def generate_proposals(scores,
                        bbox_deltas,
                        im_info,
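Because the three-level LoD of gt_segms is easy to get wrong, here is a hedged illustration of the layout just described (all counts invented; assumes the fluid.create_lod_tensor helper):

    import numpy as np
    import paddle.fluid as fluid

    # Batch of 2 images: image 0 has 2 gt objects; image 1 has 1 gt object
    # described by 2 polygons.
    recursive_seq_lens = [
        [2, 1],        # LoD[0]: gt objects per image
        [1, 1, 2],     # LoD[1]: segmentations per object
        [3, 4, 3, 5],  # LoD[2]: (x, y) points per polygon
    ]
    # S = 3 + 4 + 3 + 5 = 15 rows of (x, y) points -> data shape [15, 2]
    points = np.random.rand(15, 2).astype('float32')
    gt_segms = fluid.create_lod_tensor(points, recursive_seq_lens,
                                       fluid.CPUPlace())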
@@ -1754,33 +1889,48 @@ def generate_proposals(scores,
     """
     **Generate proposal Faster-RCNN**

-    This operation proposes RoIs according to each box with their probability to be a foreground object and
-    the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals
+    This operation proposes RoIs according to each box with their
+    probability to be a foreground object, and
+    the box can be calculated by anchors. bbox_deltas and scores
+    are the outputs of the RPN. Final proposals
     could be used to train detection net.

     For generating proposals, this operation performs following steps:

-    1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4)
+    1. Transposes and resizes scores and bbox_deltas in size of
+       (H*W*A, 1) and (H*W*A, 4)
     2. Calculate box locations as proposals candidates.
     3. Clip boxes to image
     4. Remove predicted boxes with small area.
     5. Apply NMS to get final proposals as output.

     Args:
-        scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object.
-            N is batch size, A is number of anchors, H and W are height and width of the feature map.
-        bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and anchor location.
-        im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale
+        scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents
+            the probability for each box to be an object.
+            N is batch size, A is number of anchors, H and W are height and
+            width of the feature map.
+        bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W]
+            represents the difference between the predicted box location and
+            the anchor location.
+        im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin
+            image information for N batch. Info contains height, width and scale
         between origin image size and the size of feature map.
-        anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map,
-        num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized.
-        variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format.
-        pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default.
-        post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default.
+        anchors(Variable): A 4-D Tensor represents the anchors with a layout
+            of [H, W, A, 4]. H and W are height and width of the feature map,
+            num_anchors is the box count of each position. Each anchor is
+            in (xmin, ymin, xmax, ymax) format and unnormalized.
+        variances(Variable): The expanded variances of anchors with a layout of
+            [H, W, num_priors, 4]. Each variance is in
+            (xcenter, ycenter, w, h) format.
+        pre_nms_top_n(float): Number of total bboxes to be kept per
+            image before NMS. 6000 by default.
+        post_nms_top_n(float): Number of total bboxes to be kept per
+            image after NMS. 1000 by default.
         nms_thresh(float): Threshold in NMS, 0.5 by default.
-        min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default.
-        eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration.
-
+        min_size(float): Remove predicted boxes with either height or
+            width < min_size. 0.1 by default.
+        eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5,
+            adaptive_threshold = adaptive_threshold * eta in each iteration.
     """
     helper = LayerHelper('generate_proposals', **locals())
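As a concrete reading of steps 3 and 4 above, here is a small NumPy sketch (illustrative only; the operator's exact pixel conventions may differ):

    import numpy as np

    def clip_and_filter(boxes, im_height, im_width, min_size=0.1):
        # Step 3: clip (xmin, ymin, xmax, ymax) boxes to the image boundary.
        boxes = boxes.copy()
        boxes[:, 0::2] = np.clip(boxes[:, 0::2], 0, im_width - 1)   # x coordinates
        boxes[:, 1::2] = np.clip(boxes[:, 1::2], 0, im_height - 1)  # y coordinates
        # Step 4: drop boxes whose width or height falls below min_size.
        ws = boxes[:, 2] - boxes[:, 0]
        hs = boxes[:, 3] - boxes[:, 1]
        return boxes[(ws >= min_size) & (hs >= min_size)]

    boxes = np.array([[-5.0, 3.0, 20.0, 40.0],      # clipped to the 32x32 image
                      [10.0, 10.0, 10.01, 10.02]])  # too small, removed
    print(clip_and_filter(boxes, im_height=32, im_width=32))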
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 503c91c27b..6765a89a1b 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -8927,7 +8927,8 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
 def sigmoid_cross_entropy_with_logits(x,
                                       label,
                                       ignore_index=kIgnoreIndex,
-                                      name=None):
+                                      name=None,
+                                      normalize=False):
     """
     ${comment}

@@ -8936,9 +8937,25 @@ def sigmoid_cross_entropy_with_logits(x,
         label(${label_type}): ${label_comment}
         ignore_index(${ignore_index}): ${ignore_index_comment}
         name(basestring|None): Name of the output.
+        normalize(bool): If true, divide the output by the number of
+            targets != ignore_index.

     Returns:
         out(${out_type}): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+            input = fluid.layers.data(
+                name='data', shape=[10], dtype='float32')
+            label = fluid.layers.data(
+                name='label', shape=[10], dtype='float32')
+            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+                x=input,
+                label=label,
+                ignore_index=-1,
+                normalize=True)  # or False
+            # loss = fluid.layers.reduce_sum(loss)  # summation of loss
     """

     helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals())
@@ -8953,7 +8970,8 @@ def sigmoid_cross_entropy_with_logits(x,
         type="sigmoid_cross_entropy_with_logits",
         inputs={"X": x,
                 "Label": label},
-        attrs={"ignore_index": ignore_index},
+        attrs={"ignore_index": ignore_index,
+               'normalize': normalize},
         outputs={"Out": out})
     return out

diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index d99eaa0634..2d9ed9f9c6 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -203,7 +203,7 @@ class TestGenerateProposalLabels(unittest.TestCase):
             lod_level=1,
             append_batch_size=False)
         class_nums = 5
-        rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels(
+        outs = fluid.layers.generate_proposal_labels(
             rpn_rois=rpn_rois,
             gt_classes=gt_classes,
             is_crowd=is_crowd,
@@ -216,6 +216,11 @@ class TestGenerateProposalLabels(unittest.TestCase):
             bg_thresh_lo=0.0,
             bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
             class_nums=class_nums)
+        rois = outs[0]
+        labels_int32 = outs[1]
+        bbox_targets = outs[2]
+        bbox_inside_weights = outs[3]
+        bbox_outside_weights = outs[4]
         assert rois.shape[1] == 4
         assert rois.shape[0] == labels_int32.shape[0]
         assert rois.shape[0] == bbox_targets.shape[0]
@@ -226,6 +231,62 @@ class TestGenerateProposalLabels(unittest.TestCase):
         assert bbox_outside_weights.shape[1] == 4 * class_nums


+class TestGenerateMaskLabels(unittest.TestCase):
+    def test_generate_mask_labels(self):
+        program = Program()
+        with program_guard(program):
+            im_info = layers.data(
+                name='im_info',
+                shape=[1, 3],
+                dtype='float32',
+                lod_level=1,
+                append_batch_size=False)
+            gt_classes = layers.data(
+                name='gt_classes',
+                shape=[2, 1],
+                dtype='int32',
+                lod_level=1,
+                append_batch_size=False)
+            is_crowd = layers.data(
+                name='is_crowd',
+                shape=[2, 1],
+                dtype='int32',
+                lod_level=1,
+                append_batch_size=False)
+            gt_segms = layers.data(
+                name='gt_segms',
+                shape=[20, 2],
+                dtype='float32',
+                lod_level=3,
+                append_batch_size=False)
+            rois = layers.data(
+                name='rois',
+                shape=[4, 4],
+                dtype='float32',
+                lod_level=1,
+                append_batch_size=False)
+            labels_int32 = layers.data(
+                name='labels_int32',
+                shape=[4, 1],
+                dtype='int32',
+                lod_level=1,
+                append_batch_size=False)
+            num_classes = 5
+            resolution = 14
+            outs = fluid.layers.generate_mask_labels(
+                im_info=im_info,
+                gt_classes=gt_classes,
+                is_crowd=is_crowd,
+                gt_segms=gt_segms,
+                rois=rois,
+                labels_int32=labels_int32,
+                num_classes=num_classes,
+                resolution=resolution)
+            mask_rois, roi_has_mask_int32, mask_int32 = outs
+            assert mask_rois.shape[1] == 4
+            assert mask_int32.shape[1] == num_classes * resolution * resolution
+
+
 class TestMultiBoxHead(unittest.TestCase):
     def test_multi_box_head(self):
         data_shape = [3, 224, 224]
@@ -313,7 +374,7 @@ class TestRpnTargetAssign(unittest.TestCase):
             name='gt_boxes', shape=[4], lod_level=1, dtype='float32')
         is_crowd = layers.data(
             name='is_crowd',
-
shape=[10], + shape=[1, 10], dtype='int32', lod_level=1, append_batch_size=False) @@ -323,7 +384,7 @@ class TestRpnTargetAssign(unittest.TestCase): dtype='float32', lod_level=1, append_batch_size=False) - pred_scores, pred_loc, tgt_lbl, tgt_bbox, bbox_inside_weight = layers.rpn_target_assign( + outs = layers.rpn_target_assign( bbox_pred=bbox_pred, cls_logits=cls_logits, anchor_box=anchor_box, @@ -337,6 +398,11 @@ class TestRpnTargetAssign(unittest.TestCase): rpn_positive_overlap=0.7, rpn_negative_overlap=0.3, use_random=False) + pred_scores = outs[0] + pred_loc = outs[1] + tgt_lbl = outs[2] + tgt_bbox = outs[3] + bbox_inside_weight = outs[4] self.assertIsNotNone(pred_scores) self.assertIsNotNone(pred_loc) @@ -351,41 +417,43 @@ class TestRpnTargetAssign(unittest.TestCase): class TestGenerateProposals(unittest.TestCase): def test_generate_proposals(self): - data_shape = [20, 64, 64] - images = fluid.layers.data( - name='images', shape=data_shape, dtype='float32') - im_info = fluid.layers.data( - name='im_info', shape=[1, 3], dtype='float32') - anchors, variances = fluid.layers.anchor_generator( - name='anchor_generator', - input=images, - anchor_sizes=[32, 64], - aspect_ratios=[1.0], - variance=[0.1, 0.1, 0.2, 0.2], - stride=[16.0, 16.0], - offset=0.5) - num_anchors = anchors.shape[2] - scores = fluid.layers.data( - name='scores', shape=[1, num_anchors, 8, 8], dtype='float32') - bbox_deltas = fluid.layers.data( - name='bbox_deltas', - shape=[1, num_anchors * 4, 8, 8], - dtype='float32') - rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals( - name='generate_proposals', - scores=scores, - bbox_deltas=bbox_deltas, - im_info=im_info, - anchors=anchors, - variances=variances, - pre_nms_top_n=6000, - post_nms_top_n=1000, - nms_thresh=0.5, - min_size=0.1, - eta=1.0) - self.assertIsNotNone(rpn_rois) - self.assertIsNotNone(rpn_roi_probs) - print(rpn_rois.shape) + program = Program() + with program_guard(program): + data_shape = [20, 64, 64] + images = fluid.layers.data( + name='images', shape=data_shape, dtype='float32') + im_info = fluid.layers.data( + name='im_info', shape=[3], dtype='float32') + anchors, variances = fluid.layers.anchor_generator( + name='anchor_generator', + input=images, + anchor_sizes=[32, 64], + aspect_ratios=[1.0], + variance=[0.1, 0.1, 0.2, 0.2], + stride=[16.0, 16.0], + offset=0.5) + num_anchors = anchors.shape[2] + scores = fluid.layers.data( + name='scores', shape=[num_anchors, 8, 8], dtype='float32') + bbox_deltas = fluid.layers.data( + name='bbox_deltas', + shape=[num_anchors * 4, 8, 8], + dtype='float32') + rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals( + name='generate_proposals', + scores=scores, + bbox_deltas=bbox_deltas, + im_info=im_info, + anchors=anchors, + variances=variances, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=0.5, + min_size=0.1, + eta=1.0) + self.assertIsNotNone(rpn_rois) + self.assertIsNotNone(rpn_roi_probs) + print(rpn_rois.shape) class TestYoloDetection(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py new file mode 100644 index 0000000000..1d7ce33ea7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py @@ -0,0 +1,421 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +import math +import six +import paddle.fluid as fluid +from op_test import OpTest +''' +# Equivalent code +rles = mask_util.frPyObjects([segm], im_h, im_w) +mask = mask_util.decode(rles) +''' + + +def decode(cnts, m): + v = 0 + mask = [] + for j in range(m): + for k in range(cnts[j]): + mask.append(v) + v = 1 - v + return mask + + +def poly2mask(xy, k, h, w): + scale = 5. + x = [int(scale * p + 0.5) for p in xy[::2]] + x = x + [x[0]] + y = [int(scale * p + 0.5) for p in xy[1::2]] + y = y + [y[0]] + m = sum([ + int(max(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1]))) + int(1) + for j in range(k) + ]) + + u, v = [], [] + for j in range(k): + xs = x[j] + xe = x[j + 1] + ys = y[j] + ye = y[j + 1] + dx = abs(xe - xs) + dy = abs(ys - ye) + flip = (dx >= dy and xs > xe) or (dx < dy and ys > ye) + if flip: + xs, xe = xe, xs + ys, ye = ye, ys + + if dx >= dy: + if (dx == 0): assert ye - ys == 0 + s = 0 if dx == 0 else float(ye - ys) / dx + else: + if (dy == 0): assert xe - xs == 0 + s = 0 if dy == 0 else float(xe - xs) / dy + + if dx >= dy: + ts = [dx - d if flip else d for d in range(dx + 1)] + u.extend([xs + t for t in ts]) + v.extend([int(ys + s * t + .5) for t in ts]) + else: + ts = [dy - d if flip else d for d in range(dy + 1)] + v.extend([t + ys for t in ts]) + u.extend([int(xs + s * t + .5) for t in ts]) + + k = len(u) + x = np.zeros((k), np.int) + y = np.zeros((k), np.int) + m = 0 + for j in six.moves.xrange(1, k): + if u[j] != u[j - 1]: + xd = float(u[j] if (u[j] < u[j - 1]) else (u[j] - 1)) + xd = (xd + .5) / scale - .5 + if (math.floor(xd) != xd or xd < 0 or xd > (w - 1)): + continue + yd = float(v[j] if v[j] < v[j - 1] else v[j - 1]) + yd = (yd + .5) / scale - .5 + yd = math.ceil(0 if yd < 0 else (h if yd > h else yd)) + x[m] = int(xd) + y[m] = int(yd) + m += 1 + k = m + a = [int(x[i] * h + y[i]) for i in range(k)] + a.append(h * w) + a.sort() + b = [0] + a[:len(a) - 1] + a = [c - d for (c, d) in zip(a, b)] + + k += 1 + b = [0 for i in range(k)] + b[0] = a[0] + m, j = 1, 1 + while (j < k): + if a[j] > 0: + b[m] = a[j] + m += 1 + j += 1 + else: + j += 1 + if (j < k): + b[m - 1] += a[j] + j += 1 + mask = decode(b, m) + mask = np.array(mask, dtype=np.int).reshape((w, h)) + mask = mask.transpose((1, 0)) + return mask + + +def polys_to_boxes(polys): + """Convert a list of polygons into an array of tight bounding boxes.""" + boxes_from_polys = np.zeros((len(polys), 4), dtype=np.float32) + for i in range(len(polys)): + poly = polys[i] + x0 = min(min(p[::2]) for p in poly) + x1 = max(max(p[::2]) for p in poly) + y0 = min(min(p[1::2]) for p in poly) + y1 = max(max(p[1::2]) for p in poly) + boxes_from_polys[i, :] = [x0, y0, x1, y1] + return boxes_from_polys + + +def bbox_overlaps(boxes, query_boxes): + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + box_area = (query_boxes[k, 2] - query_boxes[k, 0] + 1) *\ + (query_boxes[k, 3] - query_boxes[k, 1] + 1) + for n in range(N): + iw = min(boxes[n, 2], query_boxes[k, 2]) -\ + max(boxes[n, 
0], query_boxes[k, 0]) + 1 + if iw > 0: + ih = min(boxes[n, 3], query_boxes[k, 3]) -\ + max(boxes[n, 1], query_boxes[k, 1]) + 1 + if ih > 0: + ua = float( + (boxes[n, 2] - boxes[n, 0] + 1) *\ + (boxes[n, 3] - boxes[n, 1] + 1) +\ + box_area - iw * ih) + overlaps[n, k] = iw * ih / ua + return overlaps + + +def polys_to_mask_wrt_box(polygons, box, M): + """Convert from the COCO polygon segmentation format to a binary mask + encoded as a 2D array of data type numpy.float32. The polygon segmentation + is understood to be enclosed in the given box and rasterized to an M x M + mask. The resulting mask is therefore of shape (M, M). + """ + w = box[2] - box[0] + h = box[3] - box[1] + + w = np.maximum(w, 1) + h = np.maximum(h, 1) + + polygons_norm = [] + for poly in polygons: + p = np.array(poly, dtype=np.float32) + p[0::2] = (p[0::2] - box[0]) * M / w + p[1::2] = (p[1::2] - box[1]) * M / h + polygons_norm.append(p) + + mask = [] + for polygons in polygons_norm: + assert polygons.shape[0] % 2 == 0 + k = polygons.shape[0] // 2 + mask.append(poly2mask(polygons, k, M, M)) + mask = np.array(mask) + # Flatten in case polygons was a list + mask = np.sum(mask, axis=0) + mask = np.array(mask > 0, dtype=np.float32) + return mask + + +def expand_mask_targets(masks, mask_class_labels, resolution, num_classes): + """Expand masks from shape (#masks, resolution ** 2) + to (#masks, #classes * resolution ** 2) to encode class + specific mask targets. + """ + assert masks.shape[0] == mask_class_labels.shape[0] + + # Target values of -1 are "don't care" / ignore labels + mask_targets = -np.ones( + (masks.shape[0], num_classes * resolution**2), dtype=np.int32) + for i in range(masks.shape[0]): + cls = int(mask_class_labels[i]) + start = resolution**2 * cls + end = start + resolution**2 + # Ignore background instance + # (only happens when there is no fg samples in an image) + if cls > 0: + mask_targets[i, start:end] = masks[i, :] + return mask_targets + + +def generate_mask_labels(num_classes, im_info, gt_classes, is_crowd, + label_int32, gt_polys, resolution, rois, roi_lod, + gt_lod): + mask_rois = [] + roi_has_mask_int32 = [] + mask_int32 = [] + new_lod = [] + for i in range(len(im_info)): + roi_s = roi_lod[i] + roi_e = roi_lod[i + 1] + gt_s = gt_lod[i] + gt_e = gt_lod[i + 1] + mask_blob = _sample_mask(num_classes, im_info[i], gt_classes[gt_s:gt_e], + is_crowd[gt_s:gt_e], label_int32[roi_s:roi_e], + gt_polys[i], resolution, rois[roi_s:roi_e]) + new_lod.append(mask_blob['mask_rois'].shape[0]) + mask_rois.append(mask_blob['mask_rois']) + roi_has_mask_int32.append(mask_blob['roi_has_mask_int32']) + mask_int32.append(mask_blob['mask_int32']) + return mask_rois, roi_has_mask_int32, mask_int32, new_lod + + +def _sample_mask( + num_classes, + im_info, + gt_classes, + is_crowd, + label_int32, + gt_polys, # [[[], []], []] + resolution, + rois): + mask_blob = {} + im_scale = im_info[2] + sample_boxes = rois + polys_gt_inds = np.where((gt_classes > 0) & (is_crowd == 0))[0] + polys_gt = [gt_polys[i] for i in polys_gt_inds] + boxes_from_polys = polys_to_boxes(polys_gt) + + fg_inds = np.where(label_int32 > 0)[0] + roi_has_mask = fg_inds.copy() + if fg_inds.shape[0] > 0: + mask_class_labels = label_int32[fg_inds] + masks = np.zeros((fg_inds.shape[0], resolution**2), dtype=np.int32) + rois_fg = sample_boxes[fg_inds] + overlaps_bbfg_bbpolys = bbox_overlaps( + rois_fg.astype(np.float32), boxes_from_polys.astype(np.float32)) + fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1) + for i in range(rois_fg.shape[0]): + fg_polys_ind = 
fg_polys_inds[i] + poly_gt = polys_gt[fg_polys_ind] + roi_fg = rois_fg[i] + mask = polys_to_mask_wrt_box(poly_gt, roi_fg, resolution) + mask = np.array(mask > 0, dtype=np.int32) + masks[i, :] = np.reshape(mask, resolution**2) + else: + bg_inds = np.where(label_int32 == 0)[0] + rois_fg = sample_boxes[bg_inds[0]].reshape((1, -1)) + masks = -np.ones((1, resolution**2), dtype=np.int32) + mask_class_labels = np.zeros((1, )) + roi_has_mask = np.append(roi_has_mask, 0) + masks = expand_mask_targets(masks, mask_class_labels, resolution, + num_classes) + rois_fg *= im_scale + mask_blob['mask_rois'] = rois_fg + mask_blob['roi_has_mask_int32'] = roi_has_mask + mask_blob['mask_int32'] = masks + return mask_blob + + +def trans_lod(lod): + new_lod = [0] + for i in range(len(lod)): + new_lod.append(lod[i] + new_lod[i]) + return new_lod + + +class TestGenerateMaskLabels(OpTest): + def set_data(self): + self.init_test_case() + self.make_generate_proposal_labels_out() + self.generate_gt_polys() + self.generate_groundtruth() + self.init_test_output() + self.inputs = { + 'ImInfo': self.im_info, + 'GtClasses': (self.gt_classes.astype(np.int32), self.gt_lod), + 'IsCrowd': (self.is_crowd.astype(np.int32), self.gt_lod), + 'LabelsInt32': (self.label_int32.astype(np.int32), self.rois_lod), + 'GtSegms': (self.gt_polys.astype(np.float32), self.masks_lod), + 'Rois': (self.rois.astype(np.float32), self.rois_lod) + } + self.attrs = { + 'num_classes': self.num_classes, + 'resolution': self.resolution + } + self.outputs = { + 'MaskRois': (self.mask_rois, [self.new_lod]), + 'RoiHasMaskInt32': (self.roi_has_mask_int32, [self.new_lod]), + 'MaskInt32': (self.mask_int32, [self.new_lod]) + } + + def init_test_case(self): + self.num_classes = 81 + self.resolution = 14 + self.batch_size = 2 + self.batch_size_per_im = 64 + self.images_shape = [100, 200] + np.random.seed(0) + + def make_generate_proposal_labels_out(self): + rois = [] + self.rois_lod = [[]] + self.label_int32 = [] + for bno in range(self.batch_size): + self.rois_lod[0].append(self.batch_size_per_im) + for i in range(self.batch_size_per_im): + xywh = np.random.rand(4) + xy1 = xywh[0:2] * 2 + wh = xywh[2:4] * (self.images_shape[0] - xy1) + xy2 = xy1 + wh + roi = [xy1[0], xy1[1], xy2[0], xy2[1]] + rois.append(roi) + self.rois = np.array(rois).astype("float32") + for idx, roi_num in enumerate(self.rois_lod[0]): + for roi_id in range(roi_num): + class_id = np.random.random_integers(self.num_classes - 1) + if idx == 0: + # set an image with no foreground, to test the empty case + self.label_int32.append(0) + else: + self.label_int32.append(class_id) + label_np = np.array(self.label_int32) + self.label_int32 = label_np[:, np.newaxis] + + def generate_gt_polys(self): + h, w = self.images_shape[0:2] + self.gt_polys = [] + self.gt_polys_list = [] + max_gt = 4 + max_poly_num = 5 + min_poly_size = 4 + max_poly_size = 16 + lod0 = [] + lod1 = [] + lod2 = [] + for i in range(self.batch_size): + gt_num = np.random.randint(1, high=max_gt, size=1)[0] + lod0.append(gt_num) + ptss = [] + for i in range(gt_num): + poly_num = np.random.randint(1, max_poly_num, size=1)[0] + lod1.append(poly_num) + pts = [] + for j in range(poly_num): + poly_size = np.random.randint( + min_poly_size, max_poly_size, size=1)[0] + x = np.random.rand(poly_size, 1) * w + y = np.random.rand(poly_size, 1) * h + xy = np.concatenate((x, y), axis=1) + pts.append(xy.flatten().tolist()) + self.gt_polys.extend(xy.flatten().tolist()) + lod2.append(poly_size) + ptss.append(pts) + self.gt_polys_list.append(ptss) + 
self.masks_lod = [lod0, lod1, lod2] + self.gt_lod = [lod0] + self.gt_polys = np.array(self.gt_polys).astype('float32').reshape(-1, 2) + + def generate_groundtruth(self): + self.im_info = [] + self.gt_classes = [] + self.is_crowd = [] + for roi_num in self.gt_lod[0]: + self.im_info.append(self.images_shape + [1.0]) + for roi_id in range(roi_num): + class_id = np.random.random_integers(self.num_classes - 1) + self.gt_classes.append(class_id) + self.is_crowd.append(0) + self.im_info = np.array(self.im_info).astype(np.float32) + gt_classes_np = np.array(self.gt_classes) + self.gt_classes = gt_classes_np[:, np.newaxis] + is_crowd_np = np.array(self.is_crowd) + self.is_crowd = is_crowd_np[:, np.newaxis] + + def init_test_output(self): + roi_lod = trans_lod(self.rois_lod[0]) + gt_lod = trans_lod(self.gt_lod[0]) + outs = generate_mask_labels(self.num_classes, self.im_info, + self.gt_classes, self.is_crowd, + self.label_int32, self.gt_polys_list, + self.resolution, self.rois, roi_lod, gt_lod) + self.mask_rois = outs[0] + self.roi_has_mask_int32 = outs[1] + self.mask_int32 = outs[2] + self.new_lod = outs[3] + + self.mask_rois = np.vstack(self.mask_rois) + self.roi_has_mask_int32 = np.hstack(self.roi_has_mask_int32)[:, + np.newaxis] + self.mask_int32 = np.vstack(self.mask_int32) + + def setUp(self): + self.op_type = "generate_mask_labels" + self.set_data() + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py index 2d5cd3b24b..5f6328707f 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://w_idxw.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import sys diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py index 9340d55857..5ce405dcca 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://w_idxw.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
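The class-specific layout built by expand_mask_targets in the new test above can be checked with a tiny standalone example (toy sizes; values are made up):

    import numpy as np

    resolution, num_classes = 2, 3
    masks = np.array([[1, 0, 0, 1]], dtype=np.int32)  # one RoI, flattened 2x2 mask
    mask_class_labels = np.array([2])                 # class id of that RoI

    # -1 marks "don't care"; only the slice owned by the RoI's class gets
    # the binary mask.
    targets = -np.ones((1, num_classes * resolution**2), dtype=np.int32)
    start = resolution**2 * int(mask_class_labels[0])
    targets[0, start:start + resolution**2] = masks[0]

    assert (targets[0, :2 * resolution**2] == -1).all()        # classes 0 and 1
    assert (targets[0, 2 * resolution**2:] == masks[0]).all()  # class 2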
+from __future__ import print_function + import unittest import numpy as np import sys diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py index 41797a241c..ae1883f1f7 100644 --- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py +++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py @@ -18,6 +18,7 @@ import numpy as np from op_test import OpTest from scipy.special import logit from scipy.special import expit +import paddle.fluid.core as core import unittest @@ -117,5 +118,36 @@ class TestSigmoidCrossEntropyWithLogitsOp3(OpTest): self.check_grad(['X'], 'Out') +class TestSigmoidCrossEntropyWithNorm(OpTest): + def setUp(self): + self.op_type = "sigmoid_cross_entropy_with_logits" + batch_size = 64 + num_classes = 20 + ignore_index = -1 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, (batch_size, num_classes)) + .astype("float32")), + 'Label': np.random.randint(-1, 2, (batch_size, num_classes)) + .astype("float32") + } + self.attrs = {'ignore_index': ignore_index, 'normalize': True} + sigmoid_X = expit(self.inputs['X']) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) + out = -term1 - term2 + out[np.where(self.inputs['Label'] == ignore_index)] = 0 + if self.attrs['normalize']: + out = out / float( + np.where(self.inputs['Label'] != ignore_index)[0].size) + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + if __name__ == '__main__': unittest.main() From e6218c1d7b8f60c56f70ecdda8f0a26ce2c690f3 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 23 Jan 2019 05:16:49 +0000 Subject: [PATCH 076/123] change the input to a smaller value test=develop --- paddle/fluid/inference/tests/api/tester_helper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 18ed717557..b1f7a3464a 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -183,7 +183,7 @@ void SetFakeImageInput(std::vector> *inputs, float *input_data = static_cast(input.data.data()); // fill input data, for profile easily, do not use random data here. for (size_t j = 0; j < len; ++j) { - *(input_data + j) = Random(0, 10.); + *(input_data + j) = Random(0.0, 1.0) / 10.; } } (*inputs).emplace_back(input_slots); From 119a3d4deb1164b6265ebdf6ed2e9951649aedfe Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 23 Jan 2019 14:08:08 +0800 Subject: [PATCH 077/123] update comment test=develop --- .../paddle/fluid/contrib/reader/ctr_reader.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/contrib/reader/ctr_reader.py b/python/paddle/fluid/contrib/reader/ctr_reader.py index cc10ab239b..44e8647f8c 100644 --- a/python/paddle/fluid/contrib/reader/ctr_reader.py +++ b/python/paddle/fluid/contrib/reader/ctr_reader.py @@ -88,11 +88,11 @@ def ctr_reader( dense_slot_index(list(int)): the index of dense slots sparse_slot_index(list(int)): the index of sparse slots capacity(int): The buffer capacity maintained by :code:`py_reader`. - thread_num(list|tuple): List of tuples which declaring data shapes. - batch_size(list|tuple): List of strs which declaring data type. 
-        file_list(list|tuple): List of ints which declaring data lod_level.
-        slots(bool): slot id of all sparse feature
-        name(basestring): The prefix Python queue name and Reader name. None will
+        thread_num(int): the number of threads used by the cpp reader to
+            read files.
+        batch_size(int): batch size of data.
+        file_list(list(str)): List of file names that need to be read.
+        slots(list(int64)): list of slot ids.
+        name(string): The prefix Python queue name and Reader name. None will
             be generated automatically.

     Returns:
@@ -100,7 +100,15 @@

     Examples:

-        1. The basic usage of :code:`py_reader` is as follows:
+        1. The basic usage of :code:`ctr_reader` is as follows:
+
+        .. code-block:: python
+
+            py_reader = fluid.contrib.ctr_reader.ctr_reader(
+                feed_dict=datas, file_type='plain', file_format='csv',
+                file_list=file_list, dense_slot_indexs=[1, 2, 3, 4], sparse_slot_indexs=[],
+                capacity=64, thread_num=20, batch_size=1000, slots=[], name='ctr_reader')
+
     """
     if name is None:
         queue_name = unique_name('lod_tensor_blocking_queue')

From 8b50ad80ff6934512d3959947ac1e71ea3fb9ea3 Mon Sep 17 00:00:00 2001
From: tangwei12
Date: Wed, 23 Jan 2019 15:13:22 +0800
Subject: [PATCH 078/123] checkpoint at distributed training (#14854)

checkpoint for distributed training.
---
 .../operators/distributed/grpc/grpc_client.cc |  89 ++--
 .../operators/distributed/grpc/grpc_client.h  |  17 +-
 .../operators/distributed/grpc/grpc_server.cc |  59 ++-
 .../operators/distributed/grpc/grpc_service.h |   3 +
 .../operators/distributed/request_handler.h   |  13 +
 .../distributed/request_handler_impl.cc       |  30 +-
 .../distributed/request_handler_impl.h        |  10 +
 .../fluid/operators/distributed/rpc_client.h  |   7 +
 .../operators/distributed/send_recv.proto.in  |  18 +
 .../distributed_ops/listen_and_serv_op.cc     |   5 +
 .../distributed_ops/listen_and_serv_op.h      |   3 +-
 .../operators/distributed_ops/recv_op.cc      |  63 ++-
 paddle/fluid/platform/mkldnn_reuse.h          |   4 +-
 python/paddle/fluid/framework.py              |  15 +-
 python/paddle/fluid/io.py                     | 454 +++++++++++++-----
 .../fluid/tests/unittests/dist_save_load.py   |  57 ++-
 .../fluid/tests/unittests/dist_simnet_bow.py  |  13 +-
 .../fluid/tests/unittests/test_dist_base.py   |   6 +-
 .../tests/unittests/test_dist_save_load.py    |  73 ++-
 .../tests/unittests/test_dist_transpiler.py   |  49 +-
 .../fluid/transpiler/distribute_transpiler.py | 414 ++++++++++++++--
 21 files changed, 1122 insertions(+), 280 deletions(-)

diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
index 7875c16c3c..52310f8d04 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
@@ -74,7 +74,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
   SendProcessor* s = new SendProcessor(ch);
-  const std::string method = "SendRPC";
+  const std::string method = kSendRPC;
   VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
   s->Prepare(h, time_out);

@@ -107,7 +107,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,

 void ProcGetResponse(const VarHandle& var_h,
                      const ::grpc::ByteBuffer& ret_msg) {
-  VLOG(100) << "ProcGetResponse";
+  VLOG(4) << "ProcGetResponse";
   framework::Variable* outvar = nullptr;
   // get response's trainer_id is not used
   int trainer_id;
@@ -127,59 +127,74 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
                                      const platform::DeviceContext& ctx,
                                      const framework::Scope& scope,
                                      const std::string&
var_name, + const std::string& out_varname, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, + return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, "/sendrecv.SendRecvService/GetVariable", time_out); } +VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out) { + std::string var_name_no_barrier = + string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); + + return _AsyncGetVar( + ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, + "/sendrecv.SendRecvService/GetVariableNoBarrier", time_out); +} + VarHandlePtr GRPCClient::AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, + return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name, "/sendrecv.SendRecvService/GetMonomerVariable", time_out); } -VarHandlePtr GRPCClient::_AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& rpc_path, - int64_t time_out) { +VarHandlePtr GRPCClient::_AsyncGetVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& method, + const std::string& var_name, const std::string& out_varname, + const std::string& rpc_path, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; + const std::string out_varname_val = out_varname; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); - const std::string method = "GetRPC"; - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, s, method, p_ctx, h, rpc_path, this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_trainer_id(trainer_id_); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); + framework::AsyncIO( + [var_name_val, out_varname_val, s, method, p_ctx, h, rpc_path, this] { + // prepare input + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); + req.set_trainer_id(trainer_id_); + ::grpc::ByteBuffer buf; + RequestToByteBuffer(req, &buf); - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - // stub context - s->response_call_back_ = ProcGetResponse; + // stub context + s->response_call_back_ = ProcGetResponse; - platform::RecordRPCEvent record_event(method, p_ctx); + platform::RecordRPCEvent record_event(method, p_ctx); - auto call = - s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + auto call = + s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + }); req_count_++; @@ -202,7 +217,7 @@ VarHandlePtr 
GRPCClient::AsyncPrefetchVar(const std::string& ep, const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); - const std::string method = "PrefetchRPC"; + const std::string method = kPrefetchRPC; VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); @@ -242,7 +257,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = "BatchBarrierRPC"; + const std::string method = kBatchBarrierRPC; VarHandlePtr h( new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); @@ -267,7 +282,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - const std::string method = "FetchBarrierRPC"; + const std::string method = kFetchBarrierRPC; VarHandlePtr h( new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); @@ -293,7 +308,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = "SendMonomerFetchBarrierRPC"; + const std::string method = kSendMonomerFetchBarrierRPC; VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); s->Prepare(h, time_out); @@ -320,7 +335,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = "SendCompleteRPC"; + const std::string method = kSendCompleteRPC; VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); @@ -347,7 +362,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - const std::string method = "CheckPointNotifyRPC"; + const std::string method = kCheckPointNotifyRPC; VarHandlePtr h( new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr)); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h index fa77d21257..ce0d2152aa 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h @@ -186,8 +186,15 @@ class GRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, @@ -228,11 +235,11 @@ class GRPCClient : public RPCClient { void Proceed(); std::shared_ptr GetChannel(const std::string& ep); - VarHandlePtr _AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, const std::string& rpc, - int64_t time_out); + VarHandlePtr _AsyncGetVar( + const std::string& ep, const 
platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& method,
+      const std::string& var_name, const std::string& out_varname,
+      const std::string& rpc_path, int64_t time_out = FLAGS_rpc_deadline);

  private:
   grpc::CompletionQueue cq_;
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
index 08f777e279..4a9c158cb0 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
@@ -136,17 +136,65 @@ class RequestGet final : public RequestBase {
   void Process() override {
     // proc request.
     std::string varname = request_.varname();
+    std::string out_varname = request_.out_varname();
     int trainer_id = request_.trainer_id();
-    VLOG(4) << "RequestGet " << varname;
+
+    VLOG(4) << "RequestGet " << out_varname << " from " << varname;

     auto scope = request_handler_->scope();
-    auto invar = scope->FindVar(varname);
+    framework::Variable* invar = nullptr;
     framework::Variable* outvar = nullptr;

-    request_handler_->Handle(varname, scope, invar, &outvar, trainer_id);
+    request_handler_->Handle(varname, scope, invar, &outvar, trainer_id,
+                             out_varname);

     if (outvar) {
-      SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
+      SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(),
+                            &reply_);
+    }
+    Finish(reply_, &responder_);
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  ::grpc::ByteBuffer reply_;
+  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
+};
+
+class RequestGetNoBarrier final : public RequestBase {
+ public:
+  explicit RequestGetNoBarrier(GrpcService::AsyncService* service,
+                               ::grpc::ServerCompletionQueue* cq,
+                               RequestHandler* request_handler, int req_id)
+      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
+    auto method_id =
+        static_cast<int>(distributed::GrpcMethod::kGetVariableNoBarrier);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, &request_, &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
+  }
+
+  virtual ~RequestGetNoBarrier() {}
+
+  std::string GetReqName() override { return request_.varname(); }
+
+  void Process() override {
+    // proc request.
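+    // Unlike RequestGet above, this handler never consults the batch/fetch
+    // barriers: the varname sent by the client carries the
+    // WITHOUT_BARRIER_MESSAGE suffix and request_handler_->Handle() below
+    // resolves the variable immediately.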
+ std::string varname = request_.varname(); + std::string out_varname = request_.out_varname(); + int trainer_id = request_.trainer_id(); + + VLOG(4) << "RequestGetNoBarrier " << out_varname << " from " << varname; + + auto scope = request_handler_->scope(); + framework::Variable* invar = nullptr; + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); + + if (outvar) { + SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), &reply_); } Finish(reply_, &responder_); @@ -460,6 +508,9 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, b = new RequestSend(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestGet) { b = new RequestGet(&service_, cq.get(), handler, req_id); + + } else if (rpc_name == kRequestGetNoBarrier) { + b = new RequestGetNoBarrier(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestGetMonomerVariable) { b = new RequestGetMonomerVariable(&service_, cq.get(), handler, req_id, this); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h index 0b5c5151e6..2965fe4490 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_service.h @@ -81,6 +81,7 @@ enum class GrpcMethod { kGetVariable, kPrefetchVariable, kCheckpointNotify, + kGetVariableNoBarrier, kGetMonomerVariable, kGetMonomerBarrier, }; @@ -94,6 +95,8 @@ inline const char* GrpcMethodName(GrpcMethod id) { return "/sendrecv.SendRecvService/SendVariable"; case GrpcMethod::kGetVariable: return "/sendrecv.SendRecvService/GetVariable"; + case GrpcMethod::kGetVariableNoBarrier: + return "/sendrecv.SendRecvService/GetVariableNoBarrier"; case GrpcMethod::kGetMonomerVariable: return "/sendrecv.SendRecvService/GetMonomerVariable"; case GrpcMethod::kGetMonomerBarrier: diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 62b24f150b..991158ac72 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -42,11 +42,24 @@ constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; constexpr char kRequestPrefetch[] = "RequestPrefetch"; constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; +constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier"; + +constexpr char kSendRPC[] = "SendRPC"; +constexpr char kGetRPC[] = "GetRPC"; +constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC"; +constexpr char kGetMonomerRPC[] = "GetMonomerRPC"; +constexpr char kPrefetchRPC[] = "PrefetchRPC"; +constexpr char kBatchBarrierRPC[] = "BatchBarrierRPC"; +constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC"; +constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC"; +constexpr char kSendCompleteRPC[] = "SendCompleteRPC"; +constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC"; #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" #define COMPLETE_MESSAGE "COMPLETE@RECV" +#define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV" #define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" #define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc 
b/paddle/fluid/operators/distributed/request_handler_impl.cc index 9722f8c96e..913ae76b38 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/string/piece.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -81,7 +82,8 @@ bool RequestGetHandler::Handle(const std::string& varname, const int trainer_id, const std::string& out_var_name, const std::string& table_name) { - VLOG(4) << "RequestGetHandler:" << varname; + VLOG(4) << "RequestGetHandler:" << varname + << " out_var_name: " << out_var_name; if (sync_mode_) { if (varname == FETCH_BARRIER_MESSAGE) { @@ -112,6 +114,32 @@ bool RequestGetHandler::Handle(const std::string& varname, return true; } +bool RequestGetNoBarrierHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar, + const int trainer_id, + const std::string& out_var_name, + const std::string& table_name) { + VLOG(4) << "RequestGetNoBarrierHandler:" << varname + << " out_var_name: " << out_var_name; + + // get var from pserver immediately without barriers + string::Piece without_barrier_piece(WITHOUT_BARRIER_MESSAGE); + string::Piece var_name_piece = string::Piece(varname); + + if (string::Contains(var_name_piece, without_barrier_piece)) { + var_name_piece = string::TrimSuffix(var_name_piece, without_barrier_piece); + VLOG(4) << "Get var " << var_name_piece << " with " + << WITHOUT_BARRIER_MESSAGE; + *outvar = scope_->FindVar(var_name_piece.ToString()); + return true; + } else { + PADDLE_THROW("GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE); + } + return true; +} + bool RequestPrefetchHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h index 5e0b25c5c2..f3c1b24526 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -67,6 +67,16 @@ class RequestGetHandler final : public RequestHandler { bool enable_dc_asgd_; }; +class RequestGetNoBarrierHandler final : public RequestHandler { + public: + RequestGetNoBarrierHandler() : RequestHandler(false) {} + virtual ~RequestGetNoBarrierHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; +}; + static inline void BuildVar(const std::string& param_name, std::initializer_list arguments, paddle::framework::proto::OpDesc::Var* var) { diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index b668d86978..ea54e0c295 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -43,8 +43,15 @@ class RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& 
scope, const std::string& var_name,
+                                    const std::string& out_varname,
                                     int64_t time_out = FLAGS_rpc_deadline) = 0;

+  virtual VarHandlePtr AsyncGetVarNoBarrier(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& var_name,
+      const std::string& out_varname,
+      int64_t time_out = FLAGS_rpc_deadline) = 0;
+
   virtual VarHandlePtr AsyncGetMonomerVariable(
       const std::string& ep, const platform::DeviceContext& ctx,
       const framework::Scope& scope, const std::string& var_name,
diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in
index b39eef04d8..6303667884 100644
--- a/paddle/fluid/operators/distributed/send_recv.proto.in
+++ b/paddle/fluid/operators/distributed/send_recv.proto.in
@@ -17,8 +17,14 @@ package sendrecv;
 option cc_generic_services = @cc_generic_services@;

 service SendRecvService {
+  // For parameter server round-robin like hashing, do not split tensors.
+  // Send and recv only one tensor
+  // TODO(typhoonzero): add streaming API
   rpc SendVariable(VariableMessage) returns (VoidMessage) {}
+  // Argument VariableMessage for GetVariable should only contain varname.
   rpc GetVariable(VariableMessage) returns (VariableMessage) {}
+  rpc GetVariableNoBarrier(VariableMessage) returns (VariableMessage) {}
+  // pre-fetch variable by given variable name and Ids
   rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}

   rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
@@ -27,12 +33,17 @@ service SendRecvService {
   rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
 }

+// It can be: LoDTensor, SelectedRows or NCCL_ID
 enum VarType {
   LOD_TENSOR = 0;
   SELECTED_ROWS = 1;
   NCCL_ID = 2;
 }

+// VariableMessage is a serialized paddle variable message.
+// NOTICE(gongwb): don't modify this proto if you are not
+// familiar with how we serialize it in sendrecvop_utils.h
+// and deserialize it in variable_response.h.
 message VariableMessage {
   enum Type {
     // Pod Types
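The GetVariableNoBarrier rpc above piggybacks on the variable name: the client appends WITHOUT_BARRIER_MESSAGE and the server strips it before looking the variable up in the scope. A minimal Python sketch of that naming convention (illustrative only; the patch itself implements this in C++ with string::Piece helpers):

    WITHOUT_BARRIER_MESSAGE = "@WITHOUT_BARRIER@RECV"

    def strip_no_barrier_suffix(varname):
        # Mirrors RequestGetNoBarrierHandler: a suffixed name is resolved
        # immediately; anything else is rejected by this handler.
        if varname.endswith(WITHOUT_BARRIER_MESSAGE):
            return varname[:-len(WITHOUT_BARRIER_MESSAGE)]
        raise ValueError("GetNoBarrier must contain " + WITHOUT_BARRIER_MESSAGE)

    assert strip_no_barrier_suffix("moment_1@WITHOUT_BARRIER@RECV") == "moment_1"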
@@ -49,14 +60,21 @@ message VariableMessage {
   string varname = 1;
   // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
   VarType type = 2;
+  // bool persistable is not needed for sending.
+  // tensor info:
   Type data_type = 3;
   repeated int64 dims = 4;

+  // lod details:
   int64 lod_level = 5;
   repeated LodData lod = 6;
+  // selected_rows height, aka. original dim0
   int64 slr_height = 7;
+  // tensor data
   bytes serialized = 8;
+  // selected_rows data
   bytes rows = 9;
+  // Look up table block execution output variable name.
   string out_varname = 10;
   // If 1, the ps server will start profiling, the ps
   // server stops profiling and generates a profile to /tmp/profile_ps_*
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
index 629f364d71..53968831ea 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
@@ -347,6 +347,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
       new distributed::RequestPrefetchHandler(sync_mode));
   request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler(
       sync_mode, checkpoint_block_id));
+  request_get_no_barrier_handler_.reset(
+      new distributed::RequestGetNoBarrierHandler());

   rpc_service_->RegisterRPC(distributed::kRequestSend,
                             request_send_handler_.get(),
@@ -359,6 +361,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
                             FLAGS_rpc_prefetch_thread_num);
   rpc_service_->RegisterRPC(distributed::kRequestCheckpoint,
                             request_checkpoint_handler_.get());
+  rpc_service_->RegisterRPC(distributed::kRequestGetNoBarrier,
+                            request_get_no_barrier_handler_.get());

   auto optimize_blocks =
       Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
@@ -413,6 +417,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   f(request_get_handler_.get());
   f(request_prefetch_handler_.get());
   f(request_checkpoint_handler_.get());
+  f(request_get_no_barrier_handler_.get());

   // start the server listening after all member initialized.
   server_thread_.reset(new std::thread(RunServer, rpc_service_));
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
index 9431978df8..f20442bad7 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
@@ -55,7 +55,6 @@ class ListenAndServOp : public framework::OperatorBase {
                   const framework::VariableNameMap& inputs,
                   const framework::VariableNameMap& outputs,
                   const framework::AttributeMap& attrs);
-
   virtual ~ListenAndServOp();

   void RunSyncLoop(framework::Executor* executor,
@@ -89,6 +88,8 @@ class ListenAndServOp : public framework::OperatorBase {
   mutable std::shared_ptr<distributed::RPCServer> rpc_service_;
   mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_;
   mutable std::shared_ptr<distributed::RequestHandler> request_get_handler_;
+  mutable std::shared_ptr<distributed::RequestHandler>
+      request_get_no_barrier_handler_;
   mutable std::shared_ptr<distributed::RequestHandler>
       request_prefetch_handler_;
   mutable std::shared_ptr<distributed::RequestHandler>
diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc
index 48065437e3..120c65f296 100644
--- a/paddle/fluid/operators/distributed_ops/recv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/recv_op.cc
@@ -27,30 +27,50 @@ namespace operators {

 class RecvOp : public framework::OperatorBase {
  public:
-  RecvOp(const std::string& type, const framework::VariableNameMap& inputs,
-         const framework::VariableNameMap& outputs,
-         const framework::AttributeMap& attrs)
+  RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}

-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    auto outs = Outputs("Out");
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+    std::vector<std::string> varnames =
+        Attr<std::vector<std::string>>("varnames");
     int sync_mode = Attr<int>("sync_mode");
+    auto outs = Outputs("Out");
+    bool with_barrier = Attr<bool>("with_barrier");

-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& ctx = *pool.Get(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &ctx = *pool.Get(place);

-    distributed::RPCClient* rpc_client =
+    distributed::RPCClient *rpc_client =
         distributed::RPCClient::GetInstance<RPCCLIENT_T>(
             Attr<int>("trainer_id"));

-    std::vector<distributed::VarHandlePtr> rets;
-    for (size_t i = 0; i < outs.size(); i++) {
-      VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
-      rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]));
-    }
-    if (sync_mode) {
+    if (with_barrier) {
+      std::vector<distributed::VarHandlePtr> rets;
+      for (size_t i = 0; i < outs.size(); i++) {
+        std::string varname = varnames.size() == 0 ? outs[i] : varnames[i];
+        VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with "
+                << varname << " and with AsyncGetVar";
+        rets.push_back(
+            rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i]));
+      }
+      if (sync_mode) {
+        for (size_t i = 0; i < rets.size(); i++) {
+          PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+        }
+      }
+    } else {
+      std::vector<distributed::VarHandlePtr> rets;
+      for (size_t i = 0; i < outs.size(); i++) {
+        std::string varname = varnames.size() == 0 ? outs[i] : varnames[i];
+        VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with "
+                << varname << " and with AsyncGetVarNoBarrier";
+        rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope,
+                                                        varname, outs[i]));
+      }
       for (size_t i = 0; i < rets.size(); i++) {
         PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
       }
@@ -79,12 +99,23 @@ This operator can get variables from server side.
              "(int, default 0)"
              "sync recv or async recv.")
         .SetDefault(0);
+    AddAttr<bool>("with_barrier",
+                  "(bool, default True) if with_barrier=False, will use "
+                  "AsyncGetVarNoBarrier to get variables from the pserver "
+                  "immediately")
+        .SetDefault(true);
+    AddAttr<std::vector<std::string>>(
+        "varnames",
+        "(string vector, default {}) "
+        "sometimes we need to put the received var under another name, "
+        "for example: we need a var named 'moment_1@127.0.0.1:1001', "
+        "and its real name on the parameter server is 'moment_1'.
") + .SetDefault({}); } }; class RecvOpShapeInference : public framework::InferShapeBase { public: - void operator()(framework::InferShapeContext* ctx) const override {} + void operator()(framework::InferShapeContext *ctx) const override {} }; } // namespace operators diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index faac6a12c6..269280d604 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -365,7 +365,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { mem_fmt.ndims = axis.size(); for (unsigned int i = 0; i < nchw_tz.size(); ++i) { mem_fmt.dims[i] = nchw_tz[i]; // logical dimensions (nchw format, - // regardless physical layout) + // regardless physical layout) } mem_fmt.data_type = mkldnn_f32; mem_fmt.format = mkldnn_blocked; @@ -374,7 +374,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { for (int i = nchw_tz.size() - 1; i >= 0; --i) { mem_fmt.layout_desc.blocking.padding_dims[i] = nchw_tz[i]; // logical dimensions (nchw format, regardless physical - // layout) + // layout) mem_fmt.layout_desc.blocking.block_dims[i] = 1; mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride; diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index fc5e471ae3..22f505854e 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1696,12 +1696,20 @@ class Program(object): self._current_role = core.op_proto_and_checker_maker.OpRole.Forward self._op_role_var = [] - # for distribute + # for distribute training + # _is_distributed = True if under distributed training self._is_distributed = False + # _is_chief = True if the trainer is the first one, usually No.0 self._is_chief = False - self._slice_vars_and_attrs = [] + # _parameters_on_pservers records all the parameters distributed on parameter servers. + self._parameters_on_pservers = None + # _endpoints is a list about parameter servers ip:port, such as ["ip:port","ip:port"] self._endpoints = [] + # if current role is parameter server, the _ps_endpoint is its "ip:port" + self._ps_endpoint = None + # trainers_endpoints, it is used for distribution. 
         self._trainers_endpoints = []
+        # the distributed lookup table names
         self._distributed_lookup_table = None
 
     @property
@@ -2232,8 +2240,9 @@ class Program(object):
                          "Program")
         self._is_distributed = other._is_distributed
         self._is_chief = other._is_chief
-        self._slice_vars_and_attrs = other._slice_vars_and_attrs
+        self._parameters_on_pservers = other._parameters_on_pservers
         self._endpoints = other._endpoints
+        self._ps_endpoint = other._ps_endpoint
         self._distributed_lookup_table = other._distributed_lookup_table
 
     def _copy_data_info_from(self, other):
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index e74a87fc68..6b1d4cc34f 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -19,6 +19,7 @@ import errno
 import time
 import shutil
 import six
+from functools import reduce
 
 from paddle.fluid.executor import Executor
 from paddle.fluid.evaluator import Evaluator
@@ -183,8 +184,6 @@ def save_vars(executor,
             # NOTE: don't save the variable which type is RAW
             if each_var.type == core.VarDesc.VarType.RAW:
                 continue
-            if each_var.name == main_program._distributed_lookup_table:
-                continue
             new_var = _clone_var_in_block_(save_block, each_var)
             if filename is None:
                 save_block.append_op(
@@ -206,16 +205,6 @@ def save_vars(executor,
                 outputs={},
                 attrs={'file_path': os.path.join(dirname, filename)})
 
-    # if there is lookup table, the trainer 0 will notify all pserver to save.
-    if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
-        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
-        attrs = {}
-        attrs['epmap'] = main_program._endpoints
-        attrs['dir'] = lookup_table_filename
-        attrs['lookup_table'] = main_program._distributed_lookup_table
-        save_block.append_op(
-            type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
-
     executor.run(save_program)
 
 
@@ -267,6 +256,186 @@ def save_params(executor, dirname, main_program=None, filename=None):
         filename=filename)
 
 
+def _save_distributed_persistables(executor, dirname, main_program):
+    """
+    save_persistables for distributed training.
+    The method does the following:
+    1. save part of the persistable variables on the trainer.
+    2. receive "remote prefetch variables" from the parameter servers and merge them.
+    3. save the "distributed lookup table" on the parameter servers.
+    4. receive "optimizer variables" from the parameter servers and merge them.
+
+    Args:
+        executor(Executor): The executor to run for saving parameters.
+        dirname(str): The saving directory path.
+        main_program(Program): The program whose parameters will be
+                             saved. The main_program must be the trainer_program
+                             obtained after transpiling.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            t = distribute_transpiler.DistributeTranspiler()
+            t.transpile(...)
+            train_program = t.get_trainer_program()
+            _save_distributed_persistables(executor=exe, dirname=param_path, main_program=train_program)
+    """
+
+    def __save_remote_params(executor, dirname, remote_params_map):
+        """
+        Receive params from the pservers through RPC.
+        If a param is sliced, concat the slices into one var, then save it.
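+
+        remote_params_map is expected to look like the following sketch
+        (the variable names are illustrative only):
+
+            {"fc_w": [VarDistributed(origin=fc_w, slice=fc_w.block0, ...),
+                      VarDistributed(origin=fc_w, slice=fc_w.block1, ...)]}
+
+        For each entry the generated program roughly does:
+        recv the slices without barrier -> concat them into the origin var
+        -> save the origin var -> delete the temporary slice vars.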
+        """
+        if not remote_params_map:
+            return
+
+        prog = Program()
+        block = prog.global_block()
+
+        # recv optimize/prefetch vars from the pservers
+        for name, remote_params in remote_params_map.items():
+            origin_var = None
+            is_slice = False
+            slice_vars = [0] * len(remote_params)
+            slice_var_names = [""] * len(remote_params)
+            endpoints = [""] * len(remote_params)
+
+            for idx, optimizer in enumerate(remote_params):
+                origin = optimizer.origin
+                slice = optimizer.slice
+                is_slice = optimizer.is_slice
+                block_id = optimizer.block_id
+                endpoint = optimizer.endpoint
+
+                if idx == 0:
+                    origin_var = block.create_var(
+                        name=origin.name,
+                        type=origin.type,
+                        shape=origin.shape,
+                        dtype=origin.dtype,
+                        persistable=True)
+
+                slice_var = block.create_var(
+                    name="{}.slice.{}".format(slice.name, idx),
+                    type=slice.type,
+                    shape=slice.shape,
+                    dtype=slice.dtype,
+                    persistable=True)
+
+                index = block_id if is_slice else idx
+                slice_vars[index] = slice_var
+                slice_var_names[index] = slice.name
+                endpoints[index] = endpoint
+
+            if is_slice:
+                block.append_op(
+                    type='recv',
+                    inputs={"X": []},
+                    outputs={"Out": slice_vars},
+                    attrs={
+                        "epmap": endpoints,
+                        "with_barrier": False,
+                        "varnames": slice_var_names,
+                        "sync_mode": True
+                    })
+                block.append_op(
+                    type='concat',
+                    inputs={'X': slice_vars},
+                    outputs={'Out': origin_var},
+                    attrs={})
+            else:
+                block.append_op(
+                    type='recv',
+                    inputs={"X": []},
+                    outputs={"Out": [origin_var]},
+                    attrs={
+                        "epmap": endpoints[:1],
+                        "with_barrier": False,
+                        "varnames": slice_var_names,
+                        "sync_mode": True
+                    })
+            block.append_op(
+                type='save',
+                inputs={'X': [origin_var]},
+                outputs={},
+                attrs={'file_path': os.path.join(dirname, origin_var.name)})
+            block.append_op(type='delete_var', inputs={'X': slice_vars})
+        executor.run(prog)
+
+    def __save_distributed_lookup_tables(executor, dirname,
+                                         distributed_lookup_table, endpoints):
+        """
+        Because the distributed lookup table may be too huge to merge and
+        save in one place, each parameter server saves its own shard of the
+        table independently.
+
+        The save directory is dirname/"__lookup_table__".
+        """
+        prog = Program()
+        block = prog.global_block()
+
+        # if there is a lookup table, trainer 0 will notify all pservers to save.
+        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
+        attrs = {}
+        attrs['epmap'] = endpoints
+        attrs['dir'] = lookup_table_filename
+        attrs['lookup_table'] = distributed_lookup_table
+        block.append_op(
+            type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
+        executor.run(prog)
+
+    def __exclude_vars(exclude_var_names=[]):
+        def is_valid(var):
+            if var.name in exclude_var_names:
+                return False
+            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                    var.desc.type() == core.VarDesc.VarType.READER:
+                return False
+            return var.persistable
+
+        return is_valid
+
+    if not isinstance(main_program, Program):
+        raise ValueError("'main_program' should be an instance of Program.")
+
+    if not main_program._is_distributed:
+        raise ValueError(
+            "'_save_distributed_persistables' is designed only for distributed training."
+ ) + + remote_params_map = main_program._parameters_on_pservers.get_distributed_vars_by_vtypes( + ["Optimizer", "RemotePrefetch"], groupby=True) + + exclude_var_names = [] + if remote_params_map: + exclude_var_names.extend(remote_params_map.keys()) + + if main_program._distributed_lookup_table: + if isinstance(main_program._distributed_lookup_table, list): + exclude_var_names.extend(main_program._distributed_lookup_table) + else: + exclude_var_names.append(main_program._distributed_lookup_table) + + local_vars = list( + filter(__exclude_vars(exclude_var_names), main_program.list_vars())) + save_vars( + executor, main_program=main_program, dirname=dirname, vars=local_vars) + + if main_program._is_chief: + if remote_params_map: + __save_remote_params(executor, dirname, remote_params_map) + if main_program._distributed_lookup_table: + __save_distributed_lookup_tables( + executor, dirname, main_program._distributed_lookup_table, + main_program._endpoints) + + def save_persistables(executor, dirname, main_program=None, filename=None): """ This function filters out all variables with `persistable==True` from the @@ -301,13 +470,19 @@ def save_persistables(executor, dirname, main_program=None, filename=None): fluid.io.save_persistables(executor=exe, dirname=param_path, main_program=None) """ - save_vars( - executor, - dirname=dirname, - main_program=main_program, - vars=None, - predicate=is_persistable, - filename=filename) + + if main_program and main_program._is_distributed: + _save_distributed_persistables( + executor, dirname=dirname, main_program=main_program) + + else: + save_vars( + executor, + dirname=dirname, + main_program=main_program, + vars=None, + predicate=is_persistable, + filename=filename) def load_vars(executor, @@ -402,17 +577,11 @@ def load_vars(executor, if not isinstance(main_program, Program): raise TypeError("program should be as Program type or None") - load_slice_vars = [] - for each_var in main_program._slice_vars_and_attrs: - load_slice_vars.append(each_var[2].name) - load_var_map = {} for each_var in vars: assert isinstance(each_var, Variable) if each_var.type == core.VarDesc.VarType.RAW: continue - if each_var.name in load_slice_vars: - continue new_var = _clone_var_in_block_(load_block, each_var) if filename is None: load_block.append_op( @@ -435,10 +604,6 @@ def load_vars(executor, attrs={'file_path': os.path.join(dirname, filename)}) executor.run(load_prog) - # load slice vars on pserver, if have it. - _load_slice_up_vars(executor, dirname, - main_program._slice_vars_and_attrs) - def load_params(executor, dirname, main_program=None, filename=None): """ @@ -521,12 +686,134 @@ def load_persistables(executor, dirname, main_program=None, filename=None): fluid.io.load_persistables(executor=exe, dirname=param_path, main_program=None) """ - load_vars( - executor, - dirname=dirname, - main_program=main_program, - predicate=is_persistable, - filename=filename) + + if main_program and main_program._is_distributed: + _load_distributed_persistables( + executor, dirname=dirname, main_program=main_program) + else: + load_vars( + executor, + dirname=dirname, + main_program=main_program, + predicate=is_persistable, + filename=filename) + + +def _load_distributed_persistables(executor, dirname, main_program=None): + """ + customized load_persistables for distributed training. + it should be used on parameter server, + + Args: + executor(Executor): The executor to run for saving parameters. + dirname(str): The load directory path. 
+        main_program(Program): The program whose parameters will be
+                            loaded. The main_program must be the pserver_program
+                            obtained after transpiling.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            t = distribute_transpiler.DistributeTranspiler()
+            t.transpile(...)
+            pserver_prog = t.get_pserver_program(...)
+            _load_distributed_persistables(executor=exe, dirname=param_path, main_program=pserver_prog)
+    """
+
+    def __is_distributed_part_var(varname):
+        # str.find() returns -1 when the substring is absent, so compare
+        # against -1 explicitly instead of relying on truthiness.
+        trainer_idx = varname.find(".trainer_")
+        block_idx = varname.find(".block")
+        return trainer_idx != -1 or block_idx != -1
+
+    def __load_persistable_vars(executor, dirname, need_load_vars):
+        load_prog = Program()
+        load_block = load_prog.global_block()
+        need_delete_vars = []
+
+        for param in need_load_vars:
+            origin_var = param.origin
+            slice_var = param.slice
+            is_slice = param.is_slice
+            offset = param.offset
+
+            if is_slice:
+                origin = load_block.create_var(
+                    name="{}.load".format(origin_var.name),
+                    type=origin_var.type,
+                    shape=origin_var.shape,
+                    dtype=origin_var.dtype,
+                    persistable=True)
+
+                load_block.append_op(
+                    type='load',
+                    inputs={},
+                    outputs={'Out': [origin]},
+                    attrs={
+                        'file_path': os.path.join(dirname, origin_var.name)
+                    })
+
+                slice = load_block.create_var(
+                    name=slice_var.name,
+                    type=slice_var.type,
+                    shape=slice_var.shape,
+                    dtype=slice_var.dtype,
+                    persistable=True)
+
+                # offset counts the elements of the origin var that precede
+                # this slice, so dividing by the flattened trailing dims
+                # yields the start row along axis 0.
+                dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
+                start = int(offset / dim1_flatten)
+                end = int(offset / dim1_flatten + slice.shape[0])
+
+                load_block.append_op(
+                    type="slice",
+                    inputs={'Input': origin},
+                    outputs={'Out': slice},
+                    attrs={'axes': [0],
+                           'starts': [start],
+                           'ends': [end]})
+
+                need_delete_vars.append(origin)
+            else:
+                origin = load_block.create_var(
+                    name="{}".format(origin_var.name),
+                    type=origin_var.type,
+                    shape=origin_var.shape,
+                    dtype=origin_var.dtype,
+                    persistable=True)
+                load_block.append_op(
+                    type='load',
+                    inputs={},
+                    outputs={'Out': [origin]},
+                    attrs={
+                        'file_path': os.path.join(dirname, origin_var.name)
+                    })
+
+        load_block.append_op(
+            type='delete_var',
+            inputs={'X': need_delete_vars}, )
+
+        executor.run(load_prog)
+
+    if not isinstance(main_program, Program):
+        raise ValueError("'main_program' should be an instance of Program.")
+
+    if not main_program._is_distributed:
+        raise ValueError(
+            "'_load_distributed_persistables' is designed only for distributed training."
+        )
+
+    if not main_program._ps_endpoint:
+        raise ValueError(
+            "'_load_distributed_persistables' requires current_endpoint to be set in DistributeTranspiler.transpile"
+        )
+
+    need_load_vars = main_program._parameters_on_pservers.get_distributed_vars_by_ep(
+        main_program._ps_endpoint)
+    __load_persistable_vars(executor, dirname, need_load_vars)
 
 
 def prepend_feed_ops(inference_program,
@@ -795,52 +1082,6 @@ def load_inference_model(dirname,
     return [program, feed_target_names, fetch_targets]
 
 
-def _save_lookup_tables_by_notify(executor, dirname, lookup_table,
-                                  pserver_endpoints):
-    """
-    This function will send checkpoint notify message from Trainer 0
-    to all the pservers.
-    The checkpoint notify message contains lookup table name,
-    the absolute path on pserver to save lookup_table.
-
-    Args:
-        executor(Executor): The executor to run for send checkpoint notify.
-        dirname(str): The folder where to save.
-        lookup_table(string): the lookup table name, when use distribute
-            lookup table, we can get lookup table name by DistributeTranspiler.
- table_name - ps_endpoint_list(list): the parameter server ip:port list. - when use distribute lookup table, we can get ps_endpoint_list by - distribute arguments. - Return: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - table_name = "share_w" - ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] - - _save_pserver_vars_by_notify(executor=exe, - dirname=param_path, lookup_table=table_name, - pserver_endpoints=ps_endpoints) - """ - - pserver_notify_program = Program() - pserver_notify_block = pserver_notify_program.global_block() - - attrs = {} - attrs['epmap'] = pserver_endpoints - attrs['dir'] = dirname - attrs['lookup_table'] = lookup_table - - pserver_notify_block.append_op( - type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) - executor.run(pserver_notify_program) - - def _endpoints_replacement(program, endpoints): ENDPOINT_MAP = "epmap" for op in program.global_block().ops: @@ -911,54 +1152,3 @@ def get_parameter_value_by_name(name, executor, program=None): program = default_main_program() var = program.global_block().var(name) return get_parameter_value(var, executor) - - -def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs): - if not slice_vars_and_attrs: - return - - load_prog = Program() - load_block = load_prog.global_block() - need_delete_vars = [] - - for var_tuple in slice_vars_and_attrs: - orig_var = var_tuple[0] - start = var_tuple[1] - slice_var = var_tuple[2] - end = start + slice_var.shape[0] - - orig_var_name = orig_var.name - orig_var.name = "{}.origin".format(orig_var_name) - - clone_orig_var = load_block.create_var( - name=orig_var.name, - type=orig_var.type, - shape=orig_var.shape, - dtype=orig_var.dtype, - persistable=True) - - clone_slice_var = load_block.create_var( - name=slice_var.name, - type=slice_var.type, - shape=slice_var.shape, - dtype=slice_var.dtype, - persistable=True) - - load_block.append_op( - type='load', - inputs={}, - outputs={'Out': [clone_orig_var]}, - attrs={'file_path': os.path.join(dirname, orig_var_name)}) - load_block.append_op( - type="slice", - inputs={'Input': clone_orig_var}, - outputs={'Out': clone_slice_var}, - attrs={'axes': [0], - 'starts': [start], - 'ends': [end]}) - need_delete_vars.append(clone_orig_var) - - load_block.append_op( - type='delete_var', - inputs={'X': need_delete_vars}, ) - executor.run(load_prog) diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py index faec535042..f0f13a9d49 100644 --- a/python/paddle/fluid/tests/unittests/dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/dist_save_load.py @@ -80,7 +80,8 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): # NOTE: pserver should not call memory optimize t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, - args.trainers, args.sync_mode) + args.trainers, args.sync_mode, False, + args.current_endpoint) pserver_prog = t.get_pserver_program(args.current_endpoint) startup_prog = t.get_startup_program(args.current_endpoint, pserver_prog) @@ -93,7 +94,8 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): exe.run(startup_prog) if need_load and model_dir: - self._load_persistable_vars(exe, model_dir, startup_prog) + fluid.io.load_persistables(exe, model_dir, pserver_prog) + exe.run(pserver_prog) def run_trainer(self, args): @@ -158,19 +160,46 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): need_save = bool(int(os.getenv("SAVE", "0"))) model_dir = 
os.getenv("MODEL_DIR", "") - - if need_save: - for _ in six.moves.xrange(RUN_STEP): - loss, = exe.run(fetch_list=[avg_cost.name], - feed=feeder.feed(get_data())) - if need_save and model_dir: - io.save_persistables(startup_exe, model_dir, trainer_prog) - - var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor()) - if six.PY2: - print(pickle.dumps(np.ravel(var).tolist())) + save_mode = os.getenv("SAVE_MODE", "") + + if save_mode == "LOCAL": + if need_save: + for _ in six.moves.xrange(RUN_STEP): + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + if need_save and model_dir: + io.save_persistables(startup_exe, model_dir, trainer_prog) + + var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor( + )) + if six.PY2: + print(pickle.dumps(np.ravel(var).tolist())) + else: + sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist())) + + elif save_mode == "DIST": + skip_steps = int(os.getenv("SKIP_STEPS")) + loss = None + if need_save: + for idx in six.moves.xrange(8): + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + if need_save and model_dir and idx == skip_steps and args.trainer_id == 0: + io.save_persistables(startup_exe, model_dir, + trainer_prog) + else: + for idx in six.moves.xrange(8): + data = get_data() + if idx <= skip_steps: + continue + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(data)) + if six.PY2: + print(pickle.dumps(loss.tolist())) + else: + sys.stdout.buffer.write(pickle.dumps(loss.tolist())) else: - sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist())) + raise Exception("save_mode must be LOCAL or DIST") if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py index fac5e037a4..09afae6114 100644 --- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py @@ -75,9 +75,13 @@ def get_loss(cos_q_pt, cos_q_nt): return avg_cost -def get_optimizer(): - # SGD optimizer - optimizer = fluid.optimizer.SGD(learning_rate=base_lr) +def get_optimizer(op="sgd"): + if op.upper() == "sgd".upper(): + optimizer = fluid.optimizer.SGD(learning_rate=base_lr) + elif op.upper() == "adam".upper(): + optimizer = fluid.optimizer.Adam(learning_rate=base_lr) + else: + optimizer = fluid.optimizer.SGD(learning_rate=base_lr) return optimizer @@ -237,7 +241,8 @@ class TestDistSimnetBow2x2(TestDistRunnerBase): inference_program = fluid.default_main_program().clone() # Optimization - opt = get_optimizer() + opt = os.getenv('OPTIMIZER', 'sgd') + opt = get_optimizer(opt) opt.minimize(avg_cost) # Reader diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 69a38618cd..e51ae1a944 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -43,7 +43,8 @@ class TestDistRunnerBase(object): pserver_endpoints, trainers, sync_mode, - dc_asgd=False): + dc_asgd=False, + current_endpoint=None): # NOTE: import fluid until runtime, or else forking processes will cause error. 
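+        # current_endpoint is forwarded to transpile() so the program can
+        # record its own "ip:port" (_ps_endpoint), which the distributed
+        # save/load path in fluid.io relies on.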
config = fluid.DistributeTranspilerConfig() config.enable_dc_asgd = dc_asgd @@ -53,7 +54,8 @@ class TestDistRunnerBase(object): program=main_program, pservers=pserver_endpoints, trainers=trainers, - sync_mode=sync_mode) + sync_mode=sync_mode, + current_endpoint=current_endpoint) return t def run_pserver(self, args): diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py index 4588ca7c17..e795bc410e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py @@ -33,7 +33,6 @@ class TestDistSaveLoadDense2x2(TestDistBase): delta=1e-3, check_error_log=False, need_envs={}): - required_envs = { "PATH": os.getenv("PATH", ""), "PYTHONPATH": os.getenv("PYTHONPATH", ""), @@ -77,7 +76,77 @@ class TestDistSaveLoadDense2x2(TestDistBase): need_envs = { "IS_DISTRIBUTED": '0', "IS_SPARSE": '0', - 'IS_SELF_CONTAINED_LR': '1' + 'IS_SELF_CONTAINED_LR': '1', + 'SAVE_MODE': 'LOCAL', + } + self.check_with_place( + "dist_save_load.py", + delta=0, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "http_proxy": "" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + + model_dir = tempfile.mkdtemp() + + save_env = {} + save_env["SAVE_MODE"] = "DIST" + save_env["SAVE"] = "1" + save_env["MODEL_DIR"] = model_dir + save_env.update(required_envs) + + tr0_var_1, tr1_var_1 = self._run_cluster(model_file, save_env, + check_error_log) + + load_env = {} + load_env["LOAD"] = "1" + load_env["MODEL_DIR"] = model_dir + load_env.update(required_envs) + tr0_var_2, tr1_var_2 = self._run_cluster(model_file, load_env, + check_error_log) + + shutil.rmtree(model_dir) + + train0_1_np = np.array(tr0_var_1) + train1_1_np = np.array(tr1_var_1) + train0_2_np = np.array(tr0_var_2) + train1_2_np = np.array(tr1_var_2) + + self.assertAlmostEqual( + train0_1_np.all(), train0_2_np.all(), delta=delta) + self.assertAlmostEqual( + train1_1_np.all(), train1_2_np.all(), delta=delta) + + def test_dist(self): + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '0', + 'IS_SELF_CONTAINED_LR': '1', + 'SAVE_MODE': 'DIST', + 'OPTIMIZER': 'ADAM', + 'SKIP_STEPS': str(np.random.randint(2, 6)) } self.check_with_place( "dist_save_load.py", diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 3d1ce6b27c..3566fed215 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -741,21 +741,40 @@ class TestLoadSliceVar(TranspilerTest): pserver, _ = self.get_pserver(self.pserver1_ep) pserver2, _ = self.get_pserver(self.pserver2_ep) - self.assertTrue(pserver._slice_vars_and_attrs) - self.assertTrue(pserver2._slice_vars_and_attrs) - - for idx in six.moves.xrange(len(pserver._slice_vars_and_attrs)): - self.assertEqual(pserver._slice_vars_and_attrs[idx][0], - pserver2._slice_vars_and_attrs[idx][0]) - - total_numel = six.moves.reduce( - lambda x, y: x * y, 
pserver._slice_vars_and_attrs[idx][0].shape)
-            self.assertEqual(
-                total_numel,
-                six.moves.reduce(lambda x, y: x * y,
-                                 pserver._slice_vars_and_attrs[idx][2].shape) +
-                six.moves.reduce(lambda x, y: x * y,
-                                 pserver2._slice_vars_and_attrs[idx][2].shape))
+        vars_ps1 = pserver._parameters_on_pservers.get_distributed_vars_by_ep(
+            self.pserver1_ep)
+        vars_ps2 = pserver._parameters_on_pservers.get_distributed_vars_by_ep(
+            self.pserver2_ep)
+
+        self.assertTrue(vars_ps1)
+        self.assertTrue(vars_ps2)
+
+        for idx in six.moves.xrange(len(vars_ps1)):
+            total_numel = 0
+            ps1_numel, ps2_numel = 0, 0
+
+            ps1_var = vars_ps1[idx]
+
+            if not ps1_var.is_slice:
+                total_numel = six.moves.reduce(lambda x, y: x * y,
+                                               vars_ps1[idx].origin.shape)
+                ps1_numel = six.moves.reduce(lambda x, y: x * y,
+                                             vars_ps1[idx].slice.shape)
+            else:
+                ps2_var = None
+                for var in vars_ps2:
+                    if var.origin.name == ps1_var.origin.name:
+                        ps2_var = var
+                        break
+
+                total_numel = six.moves.reduce(lambda x, y: x * y,
+                                               ps1_var.origin.shape)
+                ps1_numel = six.moves.reduce(lambda x, y: x * y,
+                                             ps1_var.slice.shape)
+                ps2_numel = six.moves.reduce(lambda x, y: x * y,
+                                             ps2_var.slice.shape)
+
+            self.assertEqual(total_numel, ps1_numel + ps2_numel)
 
 
 class TestNCCL2Transpile(TranspilerTest):
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index ea5a4cf7cd..c61cb54e1f 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -39,7 +39,7 @@ from .ps_dispatcher import RoundRobin, PSDispatcher
 from .. import core, framework, unique_name
 from ..framework import Program, default_main_program, \
     default_startup_program, Block, \
-    Parameter, grad_var_name
+    Parameter, Variable, grad_var_name
 from .details import *
 from ..distribute_lookup_table import find_distributed_lookup_table
 from functools import reduce
@@ -62,6 +62,260 @@ def log(*args):
         print(args)
 
 
+class VarStruct(object):
+    """
+    Records a subset of a Variable's properties in Python.
+    """
+
+    def __init__(self, name, shape, dtype, type, lod_level, persistable):
+        self.name = name
+        self.shape = shape
+        self.dtype = dtype
+        self.type = type
+        self.lod_level = lod_level
+        self.persistable = persistable
+
+
+class VarDistributed(object):
+    """
+    A class to record how a var is distributed on the parameter servers.
+    It records the relationship between the origin var and its slice var,
+    together with the slice var's properties such as type/shape/offset/endpoint.
+    """
+
+    def __init__(self,
+                 origin_var,
+                 slice_var,
+                 is_slice=None,
+                 block_id=None,
+                 offset=None,
+                 vtype=None,
+                 endpoint=None):
+        """
+        Args:
+            origin_var(Variable|VarStruct): origin var properties
+            slice_var(Variable|VarStruct): slice var properties
+            is_slice(bool|None): whether the var is a slice of the origin var
+                (vars are split only when larger than the 8192-element minimum block size).
+            block_id(int|None): the block index of the slice var.
+            offset(int|None): if the var is sliced, the numel that precedes this slice in the origin var.
+            vtype(str|None): a tag, such as Optimizer/Param/RemotePrefetch.
+            endpoint(str|None): which parameter server the slice var is placed on, such as "127.0.0.1:1001"
+        """
+
+        if isinstance(origin_var, Variable):
+            self.origin = self.__create_var_struct(origin_var)
+        else:
+            self.origin = origin_var
+
+        if isinstance(slice_var, Variable):
+            self.slice = self.__create_var_struct(slice_var)
+        else:
+            self.slice = slice_var
+
+        if self.equal(self.origin, self.slice):
+            self.is_slice = False
+            self.block_id = 0
+            self.offset = 0
+        else:
+            self.is_slice = True
+            self.block_id = 0
+            self.offset = 0
+
+        if is_slice is not None:
+            self.is_slice = is_slice
+        if block_id is not None:
+            self.block_id = block_id
+        if offset is not None:
+            self.offset = offset
+
+        self.vtype = vtype
+        self.endpoint = endpoint
+
+    @staticmethod
+    def __create_var_struct(var):
+        return VarStruct(var.name, var.shape, var.dtype, var.type,
+                         var.lod_level, var.persistable)
+
+    @staticmethod
+    def equal(var1, var2):
+        """
+        Whether the two vars are equal.
+        Returns:
+            bool: True if equal, otherwise False.
+        """
+        assert isinstance(var1, VarStruct) and isinstance(var2, VarStruct)
+
+        return var1.name == var2.name and \
+               var1.type == var2.type and \
+               var1.shape == var2.shape and \
+               var1.dtype == var2.dtype and \
+               var1.lod_level == var2.lod_level and \
+               var1.persistable == var2.persistable
+
+    def __str__(self):
+        origin_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})". \
+            format(i="{", e="}", name=self.origin.name, type=self.origin.type,
+                   shape=self.origin.shape, dtype=self.origin.dtype)
+
+        slice_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})" \
+                        ".slice({is_slice}).block({block_id}).offset({offset})". \
+            format(i="{", e="}", name=self.slice.name, type=self.slice.type,
+                   shape=self.slice.shape, dtype=self.slice.dtype,
+                   is_slice=self.is_slice, block_id=self.block_id, offset=self.offset)
+
+        return "var owned: {}, origin var: ( {} ), slice var: ( {} ), endpoint: {} ".format(
+            self.vtype, origin_var_str, slice_var_str, self.endpoint)
+
+
+class VarsDistributed(object):
+    """
+    A collection of VarDistributed objects with helper methods to look up
+    distributed vars. Through this class we can get an overview of the
+    parameters distributed on the parameter servers; it centralizes this
+    information so that other modules, such as io.py, can conveniently
+    query how a variable is distributed.
+    """
+
+    def __init__(self):
+        self.distributed_vars = []
+
+    def add_distributed_var(self,
+                            origin_var,
+                            slice_var,
+                            is_slice=None,
+                            block_id=None,
+                            offset=None,
+                            vtype=None,
+                            endpoint=None):
+        """
+        Add a distributed var to the collection.
+
+        Args:
+            origin_var(Variable|VarStruct): origin var properties
+            slice_var(Variable|VarStruct): slice var properties
+            is_slice(bool|None): whether the var is a slice of the origin var
+                (vars are split only when larger than the 8192-element minimum block size).
+            block_id(int|None): the block index of the slice var.
+            offset(int|None): if the var is sliced, the numel that precedes this slice in the origin var.
+            vtype(str|None): a tag, such as Optimizer/Param/RemotePrefetch.
+            endpoint(str|None): which parameter server the slice var is placed on, such as "127.0.0.1:1001"
+        Returns:
+            None
+        """
+        self.distributed_vars.append(
+            VarDistributed(origin_var, slice_var, is_slice, block_id, offset,
+                           vtype, endpoint))
+
+    def get_distributed_var_by_slice(self, var_name):
+        """
+        Get the distributed var whose slice has the given name.
+
+        Args:
+            var_name(str): slice var name, such as "w.trainer_0.block1"
+        Returns:
+            VarDistributed: the distributed var, or None if not found.
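+
+        Examples:
+            .. code-block:: python
+
+                # "w.trainer_0.block1" is a hypothetical slice name produced
+                # when the transpiler splits parameter "w"
+                dist_var = vars_dist.get_distributed_var_by_slice(
+                    "w.trainer_0.block1")
+                if dist_var is not None:
+                    print(dist_var.endpoint)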
+ """ + for dist_var in self.distributed_vars: + if dist_var.slice.name == var_name: + return dist_var + return None + + @staticmethod + def equal(var1, var2): + """ + the two var is equal or not. + Returns: + bool: equal will return True else False + """ + return var1.name == var2.name and \ + var1.type == var2.type and \ + var1.shape == var2.shape and \ + var1.dtype == var2.dtype and \ + var1.lod_level == var2.lod_level and \ + var1.persistable == var2.persistable + + def get_distributed_var_by_origin_and_ep(self, origin_var_name, endpoint): + """ + get distributed var by conditions. + + Args: + origin_var_name(str): + endpoint(str): the parameter endpoint, such as "127.0.0.1:1001" + Returns: + VarDistributed: distributed var. + """ + for dist_var in self.distributed_vars: + if dist_var.origin.name == origin_var_name and dist_var.endpoint == endpoint: + return dist_var + return None + + def get_distributed_vars_by_vtypes(self, vtypes, groupby=False): + """ + get distributed vars by conditions. + + Args: + vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch" + groupby(bool|False): group by origin var or not. + + Returns: + list: distributed var list. + dict: distributed var map when groupby=True + """ + vtype_vars = [] + for var in self.distributed_vars: + if var.vtype in vtypes: + vtype_vars.append(var) + if not groupby: + return vtype_vars + + params_map = {} + for var in vtype_vars: + origin_var_name = var.origin.name + + if origin_var_name in params_map.keys(): + optimizers = params_map.get(origin_var_name) + else: + optimizers = [] + optimizers.append(var) + params_map[origin_var_name] = optimizers + return params_map + + def get_distributed_vars_by_ep(self, endpoint, vtype=None): + """ + get distributed vars by conditions. + + Args: + endpoint(str): the parameter server endpoint, such as "127.0.0.1:2001" + vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch" + + Returns: + list: distributed var list. + """ + endpoint_vars = [] + for var in self.distributed_vars: + if var.endpoint == endpoint: + endpoint_vars.append(var) + if not vtype: + return endpoint_vars + + vtype_vars = [] + for var in endpoint_vars: + if var.vtype == vtype: + vtype_vars.append(var) + return vtype_vars + + def overview(self): + """ + get the overview string about all params on all parameter servers. + + Returns: + Str: overview string. 
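+
+            Each line follows VarDistributed.__str__, e.g. (illustrative):
+            var owned: Param, origin var: ( w : ... ), slice var: ( w.block0 : ... ), endpoint: 127.0.0.1:6174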
+ + """ + vars_str = [] + for var in self.distributed_vars: + vars_str.append(str(var)) + return "\n".join(vars_str) + + class VarBlock: def __init__(self, varname, offset, size): self.varname = varname @@ -223,16 +477,13 @@ class DistributeTranspiler(object): trainer_id, trainers, current_endpoint, - startup_program=None, - wait_port=True): + startup_program=None): if not startup_program: startup_program = default_startup_program() if trainer_id >= 0: worker_endpoints = trainers.split(",") # send NCCL_ID to others or recv from trainer 0 worker_endpoints.remove(current_endpoint) - if trainer_id == 0 and wait_port: - wait_server_ready(worker_endpoints) nccl_id_var = startup_program.global_block().create_var( name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW) @@ -313,13 +564,11 @@ class DistributeTranspiler(object): if self.config.mode == "nccl2": assert (isinstance(trainers, str)) - self.origin_program._trainers_endpoints = trainers.split(",") self._transpile_nccl2( trainer_id, trainers, current_endpoint, - startup_program=startup_program, - wait_port=self.config.wait_port) + startup_program=startup_program) return self.trainer_num = trainers @@ -327,6 +576,7 @@ class DistributeTranspiler(object): self.trainer_id = trainer_id pserver_endpoints = pservers.split(",") self.pserver_endpoints = pserver_endpoints + self.vars_overview = VarsDistributed() self.optimize_ops, self.params_grads = self._get_optimize_pass() ps_dispatcher = self.config.split_method(self.pserver_endpoints) @@ -347,6 +597,7 @@ class DistributeTranspiler(object): # add distributed attrs to program self.origin_program._is_distributed = True self.origin_program._endpoints = self.pserver_endpoints + self.origin_program._ps_endpoint = current_endpoint self.origin_program._is_chief = self.trainer_id == 0 self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None @@ -454,6 +705,10 @@ class DistributeTranspiler(object): self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i]) self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i]) + distributed_var = self.vars_overview.get_distributed_var_by_slice( + recv_vars[i].name) + distributed_var.endpoint = ep + # step4: Concat the parameters splits together after recv. all_recv_outputs = [] for param_varname, splited_var in six.iteritems(self.param_var_mapping): @@ -480,6 +735,12 @@ class DistributeTranspiler(object): recv_op_role_var_name = splited_trainer_grad[0].name if param_varname in self.sparse_param_to_height_sections: + + for table_name in table_names: + distributed_var = self.vars_overview.get_distributed_var_by_slice( + table_name) + distributed_var.vtype = "RemotePrefetch" + height_sections = self.sparse_param_to_height_sections[ param_varname] self._update_remote_sparse_update_op( @@ -532,6 +793,9 @@ class DistributeTranspiler(object): pserver_endpoints) self._split_table_grad_and_add_send_vars(program, pserver_endpoints) + self._get_distributed_optimizer_vars() + self.origin_program._parameters_on_pservers = self.vars_overview + def get_trainer_program(self, wait_port=True): """ Get transpiled trainer side program. @@ -541,6 +805,7 @@ class DistributeTranspiler(object): """ # remove optimize ops and add a send op to main_program # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? 
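+        # Both the optimize ops and the learning-rate ops run on the
+        # pservers after transpilation, so they are stripped from the
+        # trainer copy here.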
+ lr_ops = self._get_lr_ops() delete_ops(self.origin_program.global_block(), self.optimize_ops) delete_ops(self.origin_program.global_block(), lr_ops) @@ -665,9 +930,14 @@ class DistributeTranspiler(object): # NOTE: assume blocks of the same variable is not distributed # on the same pserver, only change param/grad varnames for # trainers to fetch. + sys.stderr.write( + "get_pserver_program() is deprecated, call get_pserver_programs() to get pserver main and startup in a single call.\n" + ) # step1 pserver_program = Program() pserver_program.random_seed = self.origin_program.random_seed + pserver_program._copy_dist_param_info_from(self.origin_program) + # step2: Create vars to receive vars at parameter servers. recv_inputs = [] for v in self.param_grad_ep_mapping[endpoint]["params"]: @@ -703,9 +973,6 @@ class DistributeTranspiler(object): else: recv_inputs.append(single_trainer_var) - self._slice_params_and_optimizes = self._get_slice_vars_and_attrs( - endpoint) - # step 3 # Create a union-find data structure from optimize ops, # If two ops are connected, we could add these two ops @@ -882,10 +1149,6 @@ class DistributeTranspiler(object): outputs={}, attrs=attrs) - # add distributed attrs - pserver_program._slice_vars_and_attrs = list( - self._slice_params_and_optimizes.values()) - pserver_program._sync_with_cpp() # save pserver program to generate pserver side startup relatively. self.pserver_program = pserver_program @@ -984,30 +1247,88 @@ class DistributeTranspiler(object): inputs={"X": startup_param_var}, outputs={"Out": startup_tmpvar}) - # add slice vars - s_prog._slice_vars_and_attrs = pserver_program._slice_vars_and_attrs - return s_prog - def _get_slice_vars_and_attrs(self, endpoint): - slice_vars_and_attrs = {} + # ====================== private transpiler functions ===================== + def _get_slice_var_info(self, slice_var): block_suffix = "block" - for param in self.param_grad_ep_mapping[endpoint]["params"]: - orig_var_name, block_name, _ = self._get_varname_parts(param.name) - if not block_name: - continue + block_idx = 0 + offset = 0 + is_slice = False - block_idx = int(block_name.split(block_suffix)[1]) - orig_var = self.origin_program.global_block().vars[orig_var_name] + orig_var_name, block_name, _ = self._get_varname_parts(slice_var.name) - skip_dim0 = 0 - slice_vars = self.param_var_mapping[orig_var_name] - for slice_var in slice_vars[:block_idx]: - skip_dim0 += slice_var.shape[0] - slice_vars_and_attrs[param.name] = [orig_var, skip_dim0, param] - return slice_vars_and_attrs + if not block_name: + return is_slice, block_idx, offset - # ====================== private transpiler functions ===================== + block_idx = int(block_name.split(block_suffix)[1]) + skip_dim0 = 0 + slice_vars = self.param_var_mapping[orig_var_name] + + orig_dim1_flatten = reduce(lambda x, y: x * y, slice_vars[0].shape[1:]) + + for slice_var in slice_vars[:block_idx]: + skip_dim0 += slice_var.shape[0] + + offset = skip_dim0 * orig_dim1_flatten + is_slice = True + return is_slice, block_idx, offset + + def _get_distributed_optimizer_vars(self): + def _get_distributed_optimizer_var(endpoint): + opt_op_on_pserver = [] + for _, op in enumerate(self.optimize_ops): + if self._is_optimizer_op(op) and self._is_opt_op_on_pserver( + endpoint, op): + opt_op_on_pserver.append(op) + + for opt_op in opt_op_on_pserver: + dist_var = None + for key in opt_op.input_names: + if key == "Param": + param_name = opt_op.input(key)[0] + dist_var = self.vars_overview.get_distributed_var_by_origin_and_ep( + 
param_name, endpoint) + break + for key in opt_op.input_names: + if key in ["Param", "Grad", "LearningRate"]: + continue + origin_var = self.origin_program.global_block().vars[ + opt_op.input(key)[0]] + # update accumulator variable shape + new_shape = self._get_optimizer_input_shape( + opt_op.type, key, origin_var.shape, + dist_var.slice.shape) + + if new_shape == dist_var.slice.shape: + splited_var = VarStruct( + name=origin_var.name, + shape=new_shape, + dtype=origin_var.dtype, + type=origin_var.type, + lod_level=origin_var.lod_level, + persistable=origin_var.persistable) + + self.vars_overview.add_distributed_var( + origin_var=origin_var, + slice_var=splited_var, + is_slice=dist_var.is_slice, + block_id=dist_var.block_id, + offset=dist_var.offset, + vtype="Optimizer", + endpoint=endpoint) + else: + self.vars_overview.add_distributed_var( + origin_var=origin_var, + slice_var=origin_var, + is_slice=False, + block_id=0, + offset=0, + vtype="Optimizer", + endpoint=endpoint) + + for ep in self.pserver_endpoints: + _get_distributed_optimizer_var(ep) def _update_dist_lookup_table_vars(self, param_list, grad_list, params_grads): @@ -1093,6 +1414,22 @@ class DistributeTranspiler(object): # origin_param_name -> [splited_param_vars] self.param_var_mapping = self._create_vars_from_blocklist( self.origin_program, param_blocks) + + for orig_name, splited_vars in self.param_var_mapping.items(): + orig_var = self.origin_program.global_block().var(orig_name) + + for splited_var in splited_vars: + is_slice, block_id, offset = self._get_slice_var_info( + splited_var) + + self.vars_overview.add_distributed_var( + origin_var=orig_var, + slice_var=splited_var, + block_id=block_id, + offset=offset, + is_slice=is_slice, + vtype="Param") + # origin_grad_name -> [splited_grad_vars] self.grad_var_mapping = self._create_vars_from_blocklist( self.origin_program, @@ -1729,13 +2066,6 @@ class DistributeTranspiler(object): shape=new_shape) new_inputs[key] = tmpvar - # var shape been changed - if new_shape != var.shape: - slice_var_args = self._slice_params_and_optimizes[ - param_var.name] - self._slice_params_and_optimizes[ - var.name] = [var, slice_var_args[1], tmpvar] - # change output's ParamOut variable outputs = self._get_output_map_from_op( self.origin_program.global_block().vars, opt_op) @@ -1763,8 +2093,8 @@ class DistributeTranspiler(object): # skip per trainer vars if g.name.find(".trainer_") == -1: # only param or grads have splited blocks - if self._orig_varname(g.name) in self.grad_name_to_param_name or\ - self._orig_varname(g.name) in self.param_name_to_grad_name: + if self._orig_varname(g.name) in self.grad_name_to_param_name or \ + self._orig_varname(g.name) in self.param_name_to_grad_name: grad_block = g break return grad_block From dbdaf15ca0c0d4fb5264015b4621434ffc36063f Mon Sep 17 00:00:00 2001 From: guomingz Date: Wed, 23 Jan 2019 16:50:38 +0800 Subject: [PATCH 079/123] [V1.3] Add the calibration tool code for int8 inference and focus test. (#15062) * Add the calibration tool code for int8 inference and focus test. * Fix the calibration tool per the review comments. test=develop * Update the calibrator doc and remove extra line. * Fix the invalid is_negative_input attr set on Mobilenet. * Add the comments and fix the format issue. test=develop * Update the CMakelist.txt for Calibration PR.Disable the Calibration UT if not enable MKLDNN. test=develop * Update the CMakeList.txt. test=develop * Disable the test_calibration case on WIN and MAC. test=develop * Add the missing brackets. 
test=develop * Remove the outdated map operator which not supported on Python3. test=develop * Fix the style issue. test=develop * 1.Update the CMakeList.txt to disable calibration tool ut when the WITH_MKL is not set; 2.Add the workaround to enable the FLAGS_use_mkldnn for PR_CI(PADDLE). test=develop * Fix the typo and format the License header. test=develop * 1.Add and Update TODOs per review comments. 2.Code clean. test=develop --- .../fluid/contrib/int8_inference/__init__.py | 13 + .../fluid/contrib/int8_inference/utility.py | 708 ++++++++++++++++++ .../paddle/fluid/contrib/tests/CMakeLists.txt | 4 + .../fluid/contrib/tests/test_calibration.py | 230 ++++++ 4 files changed, 955 insertions(+) create mode 100644 python/paddle/fluid/contrib/int8_inference/__init__.py create mode 100644 python/paddle/fluid/contrib/int8_inference/utility.py create mode 100644 python/paddle/fluid/contrib/tests/test_calibration.py diff --git a/python/paddle/fluid/contrib/int8_inference/__init__.py b/python/paddle/fluid/contrib/int8_inference/__init__.py new file mode 100644 index 0000000000..eca2dce114 --- /dev/null +++ b/python/paddle/fluid/contrib/int8_inference/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/fluid/contrib/int8_inference/utility.py b/python/paddle/fluid/contrib/int8_inference/utility.py new file mode 100644 index 0000000000..197fc5f2d2 --- /dev/null +++ b/python/paddle/fluid/contrib/int8_inference/utility.py @@ -0,0 +1,708 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle.fluid.core as core +import numpy as np +import math +import os +import paddle.fluid as fluid + + +class Calibrator(object): + ''' + The calibrator class transforms the program and updates the calculated scale into it. + This is INT8 v1 calibration tool, mainly for the support of ResNet-50 and MobileNet. + ''' + # TODO(guomingz): Below op list will be updated once more INT8 op kernels are supported. 
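+    # Typical flow (sketch): construct a Calibrator, call
+    # generate_sampling_program(), run `sampling_program` for `iterations`
+    # batches while recording the tensors listed in `sampling_vars`, then
+    # call generate_quantized_data(sampling_data) to produce the INT8
+    # program.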
+ non_conv_int8_op_type = ("pool2d") + supported_int8_op_type = ("conv2d", "pool2d") + const_sign_op_type = ('pool2d', 'reshape', 'concat', 'transpose') + u8_max = 255 + s8_max = 127 + + def __init__(self, *args, **kwargs): + self.program = kwargs['program'] + self.iterations = kwargs['iterations'] + self.pretrained_model = kwargs['pretrained_model'] + self.debug = kwargs['debug'] + self.algo = kwargs['algo'] + + self._conv_input_var_name = [] + self._conv_output_var_name = [] + self._pool2d_output_var_name = [] + self._weights_var_name = [] + self._residual_input_var_name = [] + self._int8_output_var_op_index_dict = {} + self._conv_op_index = [ + index for index, value in enumerate(self.program.global_block().ops) + if value.type == 'conv2d' + ] + + self._var_max_value_map = {} + self._var_max_range = {} + self._weights_scaling_factor = {} + self._u8_output_var = [] + self._s8_output_var = [] + self._persistable_vars = [] + + def generate_sampling_program(self): + self.__init_analysis() + self.__generate_output_program() + + def generate_quantized_data(self, sampling_data): + self.__sampling(sampling_data) + self.__save_scale() + self.__update_program() + self.__update_output_program_attr() + self.__display_debug() + + def __display_debug(self): + if self.debug: + self.__dot(self._output_program) + print(self._output_program) + + def __get_max_range_by_var_name(self, program, var_name): + """ + Check the specified variable was generated from Relu layer or not. + If the variable was the output of one of the pool2d/reshape/concat + /transpose, we keep trace the ancestor of this variable; + If the variable was the output the conv op, we check it's has_relu + attr; + Otherwise, we return the Calibrator.s8 as default value. + Returns: + Return Calibrator.u8_max if the variable was generated by Relu, + otherwise it will returns Calibrator.s8 + """ + search_end_index = -1 + input_index_name = {} + output_index_name = {} + ops_type = [] + + for index, op in enumerate(program.current_block().ops): + ops_type.append(op.type) + + input_index_name[index] = op.input_arg_names + + output_index_name[index] = op.output_arg_names + if var_name in op.output_arg_names: + search_end_index = index + + # analysis + while search_end_index >= 0: + if ops_type[search_end_index] == "relu": + return Calibrator.u8_max + + input_name = input_index_name[search_end_index][0] + + for i in output_index_name.keys(): + if input_name in output_index_name[i]: + search_end_index = i + break + + if ops_type[ + search_end_index] not in Calibrator.const_sign_op_type and ops_type[ + search_end_index] != 'conv2d': + return Calibrator.s8_max + + if ops_type[search_end_index] != 'conv2d': + continue + + if program.current_block().ops[search_end_index].has_attr( + 'fuse_relu') and program.current_block().ops[ + search_end_index].attr('fuse_relu'): + return Calibrator.u8_max + else: + return Calibrator.s8_max + + return Calibrator.s8_max + + def __check_op_type_with_specified_var_as_input(self, + program, + var_name, + start_index=0): + ''' + Check whether all the type of ops that use the specified variable as the + input.If one of those op is not int8-enabled, return False. + ''' + op_type_list = [ + op.type for op in program.current_block().ops[start_index:] + if var_name in op.input_arg_names + ] + for i in op_type_list: + if not i in Calibrator.supported_int8_op_type: + return False + return True + + def __check_var_source_dt(self, var_name): + ''' + Check whether the specified variable is the output of int8 conv op or not. 
+ If true, return the original op index. + If false, return -1 + ''' + return self._int8_output_var_op_index_dict[ + var_name] if var_name in self._int8_output_var_op_index_dict else -1 + + def __update_int8_output_var_op_index_dict(self, index, var_name=None): + ''' + Update the int8_output_variable/op_index dictionary + ''' + for k, v in self._int8_output_var_op_index_dict.items(): + if v >= index: + self._int8_output_var_op_index_dict[k] = v + 1 + if var_name: + self._int8_output_var_op_index_dict[var_name] = index + + def __update_program(self): + ''' + Update the program with the quantize/dequantize op insertion. + ''' + quantize_index, dequantize_index = self.__get_quantize_dequantize_combination( + self._output_program) + inserted_op_length = 0 + calc_max_func = self.__get_optimal_scaling_factor if self.algo == "KL" else np.max + insert_op_collection = sorted(quantize_index + dequantize_index) + + for index in insert_op_collection: + if index in quantize_index: + quantize_tmp = self._output_program.current_block().create_var( + name="quantize_{}_tmp".format(index), + dtype=core.VarDesc.VarType.UINT8) + original_out_name = self._output_program.current_block().ops[ + index + inserted_op_length - 1].output_names[0] + original_out = self._output_program.current_block().ops[ + index + inserted_op_length - 1].output(original_out_name)[0] + + op = self._output_program.current_block()._insert_op( + index=index + inserted_op_length, + type="quantize", + inputs={"Input": original_out}, + outputs={"Output": quantize_tmp}, ) + + op._set_attr("data_format", "MKLDNNLAYOUT") + op._set_attr("use_mkldnn", 1) + op._set_attr( + "Scale", self._var_max_range[original_out] / + calc_max_func(self._var_max_value_map[original_out])) + + if self.__get_max_range_by_var_name( + self._output_program, + original_out) == Calibrator.s8_max: + op._set_attr("is_negative_input", 1) + + self.__update_int8_output_var_op_index_dict( + index + inserted_op_length, "quantize_{}_tmp".format(index)) + + inserted_op_length += 1 + for op in self._output_program.current_block().ops[ + index + inserted_op_length:]: + for j in op.input_names: + if op.input(j) and op.input( + j + )[0] == original_out and op.type in Calibrator.supported_int8_op_type: + op.desc.set_input(j, + ["{}".format(quantize_tmp.name)]) + else: + start_index = index + inserted_op_length + dequantize_tmp_var = self._output_program.current_block( + ).create_var( + name="dequantize_{}_tmp".format(index + 1), + dtype="float32", ) + original_out_var = None + + for original_input in self._output_program.current_block().ops[ + start_index].input_arg_names: + index_res = self.__get_op_index_by_output_var( + self._output_program, original_input) + if index_res != -1: + original_out_var = original_input + break + + if original_out_var: + op = self._output_program.current_block()._insert_op( + index=start_index, + type="dequantize", + inputs={"Input": original_out_var}, + outputs={"Output": dequantize_tmp_var}) + op._set_attr("data_format", "MKLDNNLAYOUT") + op._set_attr("use_mkldnn", 1) + op._set_attr("Scale", self._var_max_range[original_out_var] + / calc_max_func(self._var_max_value_map[ + original_out_var])) + + for op_index in range( + start_index + 1, + len(self._output_program.current_block().ops)): + if self._output_program.current_block( + ).ops[op_index].type == "conv2d" and self._output_program.current_block( + ).ops[op_index].attr("force_fp32_output"): + continue + else: + for j in self._output_program.current_block().ops[ + op_index].input_names: + if 
len(self._output_program.current_block().ops[ + op_index].input(j) + ) and self._output_program.current_block( + ).ops[op_index].input(j)[ + 0] == original_out_var: + self._output_program.current_block( + ).ops[op_index].desc.set_input( + j, + ["{}".format(dequantize_tmp_var.name)]) + + inserted_op_length += 1 + + op._set_attr("data_format", "MKLDNNLAYOUT") + op._set_attr("use_mkldnn", 1) + + def __update_output_program_attr(self): + for i in self._output_program.list_vars(): + if i.name in self._persistable_vars: + i.persistable = False + os.system("rm -rf {}/{}".format(self.pretrained_model, i.name)) + + for i in self._u8_output_var: + self._output_program.current_block().var(i).desc.set_dtype( + core.VarDesc.VarType.UINT8) + + for i in self._s8_output_var: + self._output_program.current_block().var(i).desc.set_dtype( + core.VarDesc.VarType.INT8) + + @property + def sampling_program(self): + return self._output_program + + @property + def sampling_vars(self): + return self._weights_var_name + self._conv_input_var_name + self._conv_output_var_name + self._residual_input_var_name + self._pool2d_output_var_name + + def _is_close(self, a, b, rel_tol=1e-09, abs_tol=0.0): + return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) + + def __generate_output_program(self): + for i in self.program.list_vars(): + if not i.persistable and i.name in self.sampling_vars: + i.persistable = True + self._persistable_vars.append(i.name) + + self._output_program = self.program.clone() + + def __save_scale(self): + ''' + Update the convolution scale information. + ''' + func = self.__get_optimal_scaling_factor if self.algo == 'KL' else np.max + for i in self._conv_op_index[1:]: + weights_var_name = self.program.current_block().ops[i].input( + 'Filter')[0] + input_var_name = self.program.current_block().ops[i].input('Input')[ + 0] + output_var_name = self.program.current_block().ops[i].output( + 'Output')[0] + self._output_program.current_block().ops[i]._set_attr( + "Scale_weights", self._weights_scaling_factor[weights_var_name]) + + self._output_program.current_block().ops[i]._set_attr( + "Scale_in", self._var_max_range[input_var_name] / + func(self._var_max_value_map[input_var_name])) + self._output_program.current_block().ops[i]._set_attr( + "Scale_out", self._var_max_range[output_var_name] / + func(self._var_max_value_map[output_var_name])) + if self._output_program.current_block().ops[i].desc.input( + "ResidualData"): + residual_var_name = self._output_program.current_block().ops[ + i].desc.input("ResidualData")[0] + self._output_program.current_block().ops[i]._set_attr( + "Scale_in_eltwise", self._var_max_range[residual_var_name] / + func(self._var_max_value_map[residual_var_name])) + + def __sampling(self, sampling_data): + ''' + Sampling the variables data range. 
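+
+ For weight tensors the scale is computed per output channel; the
+ rule implemented below amounts to (editor's note):
+
+ scale[c] = s8_max / max(abs(weights[c])) # 0.0 for an all-zero channel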
+ '''
+ for i in self.program.list_vars():
+ if i.name not in self.sampling_vars:
+ continue
+
+ if i.name in self._weights_var_name:
+ scaling_factor_per_channel = []
+ data = sampling_data[i.name][0]
+ for j in range(data.shape[0]):
+ var_value = float(np.max(np.abs(data[j])))
+ if not self._is_close(var_value, 0.0):
+ scaling_factor_per_channel.append(Calibrator.s8_max /
+ var_value)
+ else:
+ scaling_factor_per_channel.append(0.0)
+ self._weights_scaling_factor[
+ i.name] = scaling_factor_per_channel
+ else:
+ if i.name in self._conv_output_var_name:
+ op_pos = self.__get_op_index_by_output_var(self.program,
+ i.name)
+ cur_op = self.program.current_block().ops[op_pos]
+
+ if cur_op.has_attr('fuse_relu') and cur_op.attr(
+ 'fuse_relu'):
+ max_range = Calibrator.u8_max
+ self._u8_output_var.append(i.name)
+ else:
+ max_range = Calibrator.s8_max
+ self._s8_output_var.append(i.name)
+ else:
+ max_range = self.__get_max_range_by_var_name(self.program,
+ i.name)
+ max_value = [[np.abs(np_data)]
+ for np_data in sampling_data[i.name]]
+
+ self._var_max_range[i.name] = max_range
+ self._var_max_value_map[i.name] = max_value
+
+ def __check_force_fp32_attr_by_output_var(self, program, var_name):
+ for op in program.current_block().ops:
+ if op.type == "conv2d" and var_name in op.output_arg_names:
+ return op.attr("force_fp32_output")
+ return False
+
+ def __get_op_index_by_output_var(self, program, var_name, start_index=0):
+ '''
+ Check whether the specified variable is the output of a conv/pool2d
+ op.
+
+ Returns:
+ The op index if the variable is the output of any conv/pool2d op,
+ -1 otherwise.
+ '''
+ for index, op in enumerate(program.current_block().ops[start_index:]):
+ if var_name in op.output_arg_names and op.type in Calibrator.supported_int8_op_type:
+ return index
+ return -1
+
+ def __get_op_index_by_input_var(self, program, var_name, start_index=0):
+ '''
+ Get the index of the op that takes the specified variable as input.
+ Returns:
+ The op index if the variable is the input of some op, or -1 if
+ the variable is not the input of any op.
+ '''
+ for index, op in enumerate(program.current_block().ops[start_index:]):
+ if var_name in op.input_arg_names:
+ return index
+
+ return -1
+
+ def __get_quantize_dequantize_combination(self, program):
+ """
+ Get the quantize/dequantize op indices for later insertion.
+ Args:
+ program: The program desc.
+ Returns:
+ Two lists containing the quantize op and dequantize op index
+ information.
+ """
+ quantize_op_index = []
+ dequantize_op_index = []
+ minimal_conv_count = 2 # at least two conv ops are required when the first conv is not int8-enabled.
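+ # Editor's illustration (hypothetical indices, not from the original
+ # patch): with conv ops at indices [3, 7, 12], the first quantize op
+ # is inserted before the second conv (index 7), the first conv keeps
+ # fp32 inputs, and dequantize ops return the graph to fp32 after the
+ # last int8 region.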
+ if len(self._conv_op_index) < minimal_conv_count: + return [], [] + + for index, value in enumerate(self._conv_op_index): + if index == 0: + quantize_op_index.append(self._conv_op_index[index + 1]) + elif index == len(self._conv_op_index) - 1: + output_var = program.current_block().ops[value].output( + "Output")[0] + if self.__check_op_type_with_specified_var_as_input( + program, output_var, index): + dequantize_op_index.append(self._conv_op_index[index] + 2) + else: + program.current_block().ops[value]._set_attr( + "force_fp32_output", True) + + elif self._conv_op_index[index] + 1 < self._conv_op_index[index + + 1]: + + program.current_block().ops[self._conv_op_index[ + index]]._set_attr("force_fp32_output", True) + + for op_index in range(self._conv_op_index[index + 1], + self._conv_op_index[index], -1): + op_type = program.current_block().ops[op_index].type + op_has_int8_input = False + input_var_name = None + input_length = len(program.current_block().ops[op_index] + .input_arg_names) + + for var_name in program.current_block().ops[ + op_index].input_arg_names: + if self.__check_var_source_dt(var_name) != -1: + op_has_int8_input = True + input_var_name = var_name + break + + if op_has_int8_input: + if op_type == "conv2d": + if program.current_block().ops[op_index + + 1].type == "conv2d": + continue + elif program.current_block( + ).ops[op_index + + 1].type in Calibrator.non_conv_int8_op_type: + dequantize_op_index.append(op_index + 2) + break + else: + program.current_block().ops[op_index]._set_attr( + "force_fp32_output", True) + continue + elif not self.__check_force_fp32_attr_by_output_var( + program, input_var_name + ) and op_index not in dequantize_op_index: + share_input_flag = True + for input_attr_name in program.current_block().ops[ + op_index].input_names: + input_var_name = program.current_block().ops[ + op_index].input(input_attr_name)[0] + cousin_op_index = self.__get_op_index_by_input_var( + program, input_var_name) + if cousin_op_index != -1 and cousin_op_index in dequantize_op_index: + share_input_flag = False + break + if share_input_flag: + dequantize_op_index.append(op_index) + + elif input_length: + output_is_to_int8_op = False + share_input_flag = True + for var_name in program.current_block().ops[ + op_index].input_arg_names: + if not self.__check_op_type_with_specified_var_as_input( + program, var_name): + share_input_flag = False + break + + for var_name in program.current_block().ops[ + op_index].output_arg_names: + if self.__get_op_index_by_output_var( + program, var_name, op_index) != -1: + output_is_to_int8_op = True + break + + if share_input_flag or output_is_to_int8_op: + quantize_op_index.append(op_index) + + return quantize_op_index, dequantize_op_index + + def __init_analysis(self): + ''' + Collect the variable names for sampling. + ''' + start_index = 1 #analysis the conv op detail from second conv op. 
+
+ for i in self._conv_op_index[start_index:]:
+ self._weights_var_name.append(self.program.current_block().ops[i]
+ .input('Filter')[0])
+ self._conv_input_var_name.append(self.program.current_block().ops[i]
+ .input('Input')[0])
+ self._conv_output_var_name.append(self.program.current_block().ops[
+ i].output('Output')[0])
+ self._int8_output_var_op_index_dict[self.program.current_block()
+ .ops[i].output('Output')[0]] = i
+ if self.program.current_block().ops[i].desc.input("ResidualData"):
+ self._residual_input_var_name.append(self.program.current_block(
+ ).ops[i].desc.input("ResidualData")[0])
+
+ if self.program.current_block().ops[i + 1].type == "pool2d":
+ self._pool2d_output_var_name.append(self.program.current_block(
+ ).ops[i + 1].output('Out')[0])
+
+ def __expand_quantized_bins(self, quantized_bins, reference_bins):
+ expanded_quantized_bins = [0] * len(reference_bins)
+ num_merged_bins = len(reference_bins) / len(quantized_bins)
+ j_start = 0
+ j_end = num_merged_bins
+ for idx in xrange(len(quantized_bins)):
+ zero_count = reference_bins[j_start:j_end].count(0)
+ num_merged_bins = j_end - j_start
+ if zero_count == num_merged_bins:
+ avg_bin_ele = 0
+ else:
+ avg_bin_ele = quantized_bins[idx] / (
+ num_merged_bins - zero_count + 0.0)
+ for idx1 in xrange(j_start, j_end):
+ expanded_quantized_bins[idx1] = (0 if reference_bins[idx1] == 0
+ else avg_bin_ele)
+ j_start += num_merged_bins
+ j_end += num_merged_bins
+ if (idx + 1) == len(quantized_bins) - 1:
+ j_end = len(reference_bins)
+ return expanded_quantized_bins
+
+ def __safe_entropy(self, reference_distr_P, P_sum, candidate_distr_Q,
+ Q_sum):
+ '''
+ Calculate the KL divergence between the two distributions.
+ '''
+ assert len(reference_distr_P) == len(candidate_distr_Q)
+ tmp_sum1 = 0
+ tmp_sum2 = 0
+ for idx in range(len(reference_distr_P)):
+ p_idx = reference_distr_P[idx]
+ q_idx = candidate_distr_Q[idx]
+ if p_idx == 0:
+ tmp_sum1 += 0
+ tmp_sum2 += 0
+ else:
+ if q_idx == 0:
+ print("Fatal error! idx = " + str(idx) +
+ " q_idx = 0! p_idx = " + str(p_idx))
+ tmp_sum1 += p_idx * (math.log(Q_sum * p_idx))
+ tmp_sum2 += p_idx * (math.log(P_sum * q_idx))
+ return (tmp_sum1 - tmp_sum2) / P_sum
+
+ # Reference: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
+ def __get_optimal_scaling_factor(self,
+ activation_blob,
+ num_quantized_bins=255):
+ '''
+ Use the KL-divergence method to get a more precise scaling factor.
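+
+ Editor's sketch of the search below: the activations are binned into
+ a 2048-bin histogram; for each candidate clip index i, the reference
+ distribution P = hist[:i] (outliers folded into bin i - 1) is
+ compared against a copy of itself quantized down to
+ num_quantized_bins levels, and the i minimizing KL(P || Q) is kept:
+
+ threshold = (best_i + 0.5) * bin_width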
+ '''
+ max_val = np.max(activation_blob)
+ min_val = np.min(activation_blob)
+ if min_val >= 0:
+ hist, hist_edges = np.histogram(
+ activation_blob, bins=2048, range=(min_val, max_val))
+ ending_iter = 2047
+ starting_iter = int(ending_iter * 0.7)
+ else:
+ th = max(abs(max_val), abs(min_val))
+ hist, hist_edges = np.histogram(
+ activation_blob, bins=2048, range=(-th, th))
+ starting_iter = 0
+ ending_iter = 2047
+ if abs(max_val) > abs(min_val):
+ while starting_iter < ending_iter:
+ if hist[starting_iter] == 0:
+ starting_iter += 1
+ continue
+ else:
+ break
+ starting_iter += int((ending_iter - starting_iter) * 0.6)
+ else:
+ while ending_iter > 0:
+ if hist[ending_iter] == 0:
+ ending_iter -= 1
+ continue
+ else:
+ break
+ starting_iter = int(0.6 * ending_iter)
+ bin_width = hist_edges[1] - hist_edges[0]
+ P_sum = len(activation_blob)
+ min_kl_divergence = 0
+ min_kl_index = 0
+ kl_inited = False
+ for i in range(starting_iter, ending_iter + 1):
+ reference_distr_P = hist[0:i].tolist()
+ outliers_count = sum(hist[i:2048])
+ if reference_distr_P[i - 1] == 0:
+ continue
+ reference_distr_P[i - 1] += outliers_count
+ reference_distr_bins = reference_distr_P[:]
+ candidate_distr_Q = hist[0:i].tolist()
+ num_merged_bins = i / num_quantized_bins
+ candidate_distr_Q_quantized = [0] * num_quantized_bins
+ j_start = 0
+ j_end = num_merged_bins
+ for idx in xrange(num_quantized_bins):
+ candidate_distr_Q_quantized[idx] = sum(candidate_distr_Q[
+ j_start:j_end])
+ j_start += num_merged_bins
+ j_end += num_merged_bins
+ if (idx + 1) == num_quantized_bins - 1:
+ j_end = i
+ candidate_distr_Q = self.__expand_quantized_bins(
+ candidate_distr_Q_quantized, reference_distr_bins)
+ Q_sum = sum(candidate_distr_Q)
+ kl_divergence = self.__safe_entropy(reference_distr_P, P_sum,
+ candidate_distr_Q, Q_sum)
+ if not kl_inited:
+ min_kl_divergence = kl_divergence
+ min_kl_index = i
+ kl_inited = True
+ elif kl_divergence < min_kl_divergence:
+ min_kl_divergence = kl_divergence
+ min_kl_index = i
+ else:
+ pass
+ if min_kl_index == 0:
+ while starting_iter > 0:
+ if hist[starting_iter] == 0:
+ starting_iter -= 1
+ continue
+ else:
+ break
+ min_kl_index = starting_iter
+ return (min_kl_index + 0.5) * bin_width
+
+ @staticmethod
+ def __dot(program, output_name="model.dot"):
+ '''
+ Generate the graphviz dot file for debugging.
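+
+ Editor's usage note: the generated file can be rendered with the
+ standard graphviz CLI, e.g.
+
+ dot -Tpng model.dot -o model.png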
+ ''' + dot_graph = "" + dot_nodes = [] + dot_edges = [] + dot_graph += "digraph pm {\n" + for block in program.blocks: + ops = list(block.ops) + for index, op in enumerate(ops): + op_type = op.type + op_name = op_type + "_" + op.output_arg_names[0].replace( + ".", "_") + "___" + str(index) + for name in op.input_arg_names: + name = name.replace(".", "_") + dot_edge = name + " -> " + op_name + if dot_edge not in dot_edges: + dot_edges.append(dot_edge) + dot_node = name + " [shape=oval, style=filled, fillcolor=yellow]" + if dot_node not in dot_nodes: + dot_nodes.append(dot_node) + + for name in op.output_arg_names: + name = name.replace(".", "_") + dot_edge = op_name + " -> " + name + if dot_edge not in dot_edges: + dot_edges.append(dot_edge) + if op_type in Calibrator.supported_int8_op_type: + if op_type == "conv2d" and op.has_attr( + 'force_fp32_output') and op.attr( + "force_fp32_output"): + dot_node = op_name + " [shape=box, style=filled, color=deeppink]" + else: + dot_node = op_name + " [shape=box, style=filled, color=greenyellow]" + elif op_type in ["quantize", "dequantize"]: + dot_node = op_name + " [shape=box, style=filled, color=gold]" + else: + dot_node = op_name + " [shape=box, style=filled, fillcolor=red]" + + if dot_node not in dot_nodes: + dot_nodes.append(dot_node) + + for dot_edge in dot_edges: + dot_graph += dot_edge + "\n" + for dot_node in dot_nodes: + dot_graph += dot_node + "\n" + dot_graph += "}" + + with open(output_name, 'w') as f: + f.write(dot_graph) diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt index 79bec8c4ad..81aee1233d 100644 --- a/python/paddle/fluid/contrib/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt @@ -1,6 +1,10 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +if(APPLE OR WIN32 OR NOT WITH_MKL) + list(REMOVE_ITEM TEST_OPS test_calibration) +endif() + foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py new file mode 100644 index 0000000000..17e4eb8b83 --- /dev/null +++ b/python/paddle/fluid/contrib/tests/test_calibration.py @@ -0,0 +1,230 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
+import unittest +import os +import numpy as np +import time +import sys +import random +import paddle +import paddle.fluid as fluid +import argparse +import functools +import contextlib +import paddle.fluid.profiler as profiler +from PIL import Image, ImageEnhance +import math +sys.path.append('..') +import int8_inference.utility as ut + +random.seed(0) +np.random.seed(0) + +DATA_DIM = 224 + +THREAD = 1 +BUF_SIZE = 102400 + +DATA_DIR = 'data/ILSVRC2012' + +img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) +img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + + +# TODO(guomingz): Remove duplicated code from line 45 ~ line 114 +def resize_short(img, target_size): + percent = float(target_size) / min(img.size[0], img.size[1]) + resized_width = int(round(img.size[0] * percent)) + resized_height = int(round(img.size[1] * percent)) + img = img.resize((resized_width, resized_height), Image.LANCZOS) + return img + + +def crop_image(img, target_size, center): + width, height = img.size + size = target_size + if center == True: + w_start = (width - size) / 2 + h_start = (height - size) / 2 + else: + w_start = np.random.randint(0, width - size + 1) + h_start = np.random.randint(0, height - size + 1) + w_end = w_start + size + h_end = h_start + size + img = img.crop((w_start, h_start, w_end, h_end)) + return img + + +def process_image(sample, mode, color_jitter, rotate): + img_path = sample[0] + + img = Image.open(img_path) + + img = resize_short(img, target_size=256) + img = crop_image(img, target_size=DATA_DIM, center=True) + + if img.mode != 'RGB': + img = img.convert('RGB') + + img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255 + img -= img_mean + img /= img_std + + return img, sample[1] + + +def _reader_creator(file_list, + mode, + shuffle=False, + color_jitter=False, + rotate=False, + data_dir=DATA_DIR): + def reader(): + with open(file_list) as flist: + full_lines = [line.strip() for line in flist] + if shuffle: + np.random.shuffle(full_lines) + + lines = full_lines + + for line in lines: + img_path, label = line.split() + img_path = os.path.join(data_dir, img_path) + if not os.path.exists(img_path): + continue + yield img_path, int(label) + + mapper = functools.partial( + process_image, mode=mode, color_jitter=color_jitter, rotate=rotate) + + return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE) + + +def val(data_dir=DATA_DIR): + file_list = os.path.join(data_dir, 'val_list.txt') + return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir) + + +class TestCalibration(unittest.TestCase): + def setUp(self): + # TODO(guomingz): Put the download process in the cmake. 
+ # Download and unzip test data set + imagenet_dl_url = 'http://paddle-inference-dist.bj.bcebos.com/int8/calibration_test_data.tar.gz' + zip_file_name = imagenet_dl_url.split('/')[-1] + cmd = 'rm -rf data {} && mkdir data && wget {} && tar xvf {} -C data'.format( + zip_file_name, imagenet_dl_url, zip_file_name) + os.system(cmd) + # resnet50 fp32 data + resnet50_fp32_model_url = 'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz' + resnet50_zip_name = resnet50_fp32_model_url.split('/')[-1] + resnet50_unzip_folder_name = 'resnet50_fp32' + cmd = 'rm -rf {} {} && mkdir {} && wget {} && tar xvf {} -C {}'.format( + resnet50_unzip_folder_name, resnet50_zip_name, + resnet50_unzip_folder_name, resnet50_fp32_model_url, + resnet50_zip_name, resnet50_unzip_folder_name) + os.system(cmd) + + self.iterations = 100 + self.skip_batch_num = 5 + + def run_program(self, model_path, generate_int8=False, algo='direct'): + image_shape = [3, 224, 224] + os.environ['FLAGS_use_mkldnn'] = 'True' + + fluid.memory_optimize(fluid.default_main_program()) + + exe = fluid.Executor(fluid.CPUPlace()) + + [infer_program, feed_dict, + fetch_targets] = fluid.io.load_inference_model(model_path, exe) + + t = fluid.transpiler.InferenceTranspiler() + t.transpile(infer_program, fluid.CPUPlace()) + + val_reader = paddle.batch(val(), batch_size=1) + + if generate_int8: + int8_model = os.path.join(os.getcwd(), "calibration_out") + + if os.path.exists(int8_model): + os.system("rm -rf " + int8_model) + os.system("mkdir " + int8_model) + + print("Start calibration ...") + + calibrator = ut.Calibrator( + program=infer_program, + pretrained_model=model_path, + iterations=100, + debug=False, + algo=algo) + + sampling_data = {} + + calibrator.generate_sampling_program() + test_info = [] + cnt = 0 + for batch_id, data in enumerate(val_reader()): + image = np.array( + [x[0].reshape(image_shape) for x in data]).astype("float32") + label = np.array([x[1] for x in data]).astype("int64") + label = label.reshape([-1, 1]) + running_program = calibrator.sampling_program.clone( + ) if generate_int8 else infer_program.clone() + for op in running_program.current_block().ops: + if op.has_attr("use_mkldnn"): + op._set_attr("use_mkldnn", True) + + _, acc1, _ = exe.run( + running_program, + feed={feed_dict[0]: image, + feed_dict[1]: label}, + fetch_list=fetch_targets) + if generate_int8: + for i in calibrator.sampling_program.list_vars(): + if i.name in calibrator.sampling_vars: + np_data = np.array(fluid.global_scope().find_var(i.name) + .get_tensor()) + if i.name not in sampling_data: + sampling_data[i.name] = [] + sampling_data[i.name].append(np_data) + + test_info.append(np.mean(acc1) * len(data)) + cnt += len(data) + + if batch_id != self.iterations - 1: + continue + + break + + if generate_int8: + calibrator.generate_quantized_data(sampling_data) + fluid.io.save_inference_model(int8_model, feed_dict, fetch_targets, + exe, calibrator.sampling_program) + print( + "Calibration is done and the corresponding files were generated at {}". 
+ format(os.path.abspath("calibration_out"))) + else: + return np.sum(test_info) / cnt + + def test_calibration_for_resnet50(self): + fp32_acc1 = self.run_program("resnet50_fp32/model") + self.run_program("resnet50_fp32/model", True) + int8_acc1 = self.run_program("calibration_out") + delta_value = np.abs(fp32_acc1 - int8_acc1) + self.assertLess(delta_value, 0.01) + + +if __name__ == '__main__': + unittest.main() From 5c68dee798754fec0bfd19225dbbe825cbce5c63 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 23 Jan 2019 09:33:23 +0000 Subject: [PATCH 080/123] fix debug compile of analysis pass fail test=develop --- paddle/fluid/inference/analysis/passes/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index 691c336ebe..9d74dc6c21 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) -cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass) +cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass zero_copy_tensor) cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass) From 51227bd4473c8cad19fe026d397dde11025a4ac6 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 23 Jan 2019 11:56:24 +0000 Subject: [PATCH 081/123] lazy_allocator test=develop --- paddle/fluid/platform/gpu_info.cc | 19 ++++++++++++++++++- paddle/fluid/pybind/pybind.cc | 20 ++++++++++++++++++-- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index ca89d91aad..a402407709 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include "paddle/fluid/platform/gpu_info.h" #include +#include +#include #include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" @@ -58,7 +60,17 @@ DEFINE_string(selected_gpus, "", namespace paddle { namespace platform { -int GetCUDADeviceCount() { +static int GetCUDADeviceCountImpl() { + const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES"); + if (cuda_visible_devices != nullptr) { + std::string cuda_visible_devices_str(cuda_visible_devices); + if (std::all_of(cuda_visible_devices_str.begin(), + cuda_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + return 0; + } + } + int count; PADDLE_ENFORCE( cudaGetDeviceCount(&count), @@ -66,6 +78,11 @@ int GetCUDADeviceCount() { return count; } +int GetCUDADeviceCount() { + static auto dev_cnt = GetCUDADeviceCountImpl(); + return dev_cnt; +} + int GetCUDAComputeCapability(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); cudaDeviceProp device_prop; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c470483756..81546b33f5 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -626,7 +626,18 @@ All parameter, weight, gradient are variables in Paddle. 
py::class_(m, "Communicator").def(py::init<>()); #endif py::class_(m, "CUDAPlace") - .def(py::init()) + .def("__init__", + [](platform::CUDAPlace &self, int dev_id) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE( + dev_id >= 0 && dev_id < platform::GetCUDADeviceCount(), + "Invalid CUDAPlace(%d), must inside [0, %d)", dev_id, + platform::GetCUDADeviceCount()); + new (&self) platform::CUDAPlace(dev_id); +#else + PADDLE_THROW("Cannot use CUDAPlace in CPU only version"); +#endif + }) .def("__str__", string::to_string); py::class_(m, "CPUPlace") @@ -634,7 +645,12 @@ All parameter, weight, gradient are variables in Paddle. .def("__str__", string::to_string); py::class_(m, "CUDAPinnedPlace") - .def(py::init<>()) + .def("__init__", + [](platform::CUDAPinnedPlace &) { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version"); +#endif + }) .def("__str__", string::to_string); py::class_(m, "Place") From eed4a6383d6688dfcb39e29c4219d7c310f1d425 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 24 Jan 2019 02:11:11 +0000 Subject: [PATCH 082/123] disable eager deletion unittest test=develop --- .../fluid/tests/unittests/test_eager_deletion_mnist.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py index 7ec1f0ae75..56dfb095de 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -16,12 +16,17 @@ import os import unittest os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" +# FIXME(zjl): It seems that this unittest fails randomly +# when comparing all reduce last loss and reduce last loss +# e.g.: AssertionError: 1.0357145 != 1.0673475 within 0.01 delta +# Disable it temporarily. +''' from test_parallel_executor_mnist import TestMNIST class EagerDeletionTestMNIST(TestMNIST): pass - +''' if __name__ == '__main__': unittest.main() From 5d026a881a6007fc2332b1cbf452220fe3ee6985 Mon Sep 17 00:00:00 2001 From: liuwei1031 <46661762+liuwei1031@users.noreply.github.com> Date: Thu, 24 Jan 2019 10:36:28 +0800 Subject: [PATCH 083/123] Gpu memory monitoring (#15436) * fix github issue 15267 test=develop * fix github issue 15267 test=develop * monitor the GPU usage during runtime * revert allocator_facade.cc change * comments update test=develop --- .../memory/allocation/legacy_allocator.cc | 41 ++++++++++++++----- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 64aa63ffe9..5d8684f083 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/memory/allocation/legacy_allocator.h" #include +#include #include #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" @@ -37,7 +38,7 @@ template void *Alloc(const Place &place, size_t size); template -void Free(const Place &place, void *p); +void Free(const Place &place, void *p, size_t size); template size_t Used(const Place &place); @@ -52,6 +53,11 @@ size_t memory_usage(const platform::Place &p); using BuddyAllocator = detail::BuddyAllocator; +std::unordered_map> + gpu_mem_info; + BuddyAllocator *GetCPUBuddyAllocator() { // We tried thread_local for inference::RNN1 model, but that not works much // for multi-thread test. 
@@ -98,7 +104,8 @@ void *Alloc(const platform::CPUPlace &place, size_t size) { } template <> -void Free(const platform::CPUPlace &place, void *p) { +void Free(const platform::CPUPlace &place, void *p, + size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); } @@ -177,9 +184,16 @@ void *Alloc(const platform::CUDAPlace &place, LOG(WARNING) << "GPU memory used: " << string::HumanReadableSize(Used(place)); platform::SetDeviceId(cur_dev); - } - if (FLAGS_init_allocated_mem) { - cudaMemset(ptr, 0xEF, size); + } else { + gpu_mem_info[place.device].first += size; + if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) { + gpu_mem_info[place.device].second = gpu_mem_info[place.device].first; + VLOG(3) << "device: " << place.device << " peak memory usage : " + << (gpu_mem_info[place.device].second >> 20) << " MiB"; + } + if (FLAGS_init_allocated_mem) { + cudaMemset(ptr, 0xEF, size); + } } return ptr; #else @@ -188,9 +202,11 @@ void *Alloc(const platform::CUDAPlace &place, } template <> -void Free(const platform::CUDAPlace &place, void *p) { +void Free(const platform::CUDAPlace &place, void *p, + size_t size) { #ifdef PADDLE_WITH_CUDA GetGPUBuddyAllocator(place.device)->Free(p); + gpu_mem_info[place.device].first -= size; #else PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); #endif @@ -243,7 +259,7 @@ void *Alloc(const platform::CUDAPinnedPlace &place, template <> void Free(const platform::CUDAPinnedPlace &place, - void *p) { + void *p, size_t size) { #ifdef PADDLE_WITH_CUDA GetCUDAPinnedBuddyAllocator()->Free(p); #else @@ -264,15 +280,17 @@ struct AllocVisitor : public boost::static_visitor { }; struct FreeVisitor : public boost::static_visitor { - inline explicit FreeVisitor(void *ptr) : ptr_(ptr) {} + inline explicit FreeVisitor(void *ptr, size_t size) + : ptr_(ptr), size_(size) {} template inline void operator()(const Place &place) const { - Free(place, ptr_); + Free(place, ptr_, size_); } private: void *ptr_; + size_t size_; }; size_t Usage::operator()(const platform::CPUPlace &cpu) const { @@ -304,8 +322,9 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { } void LegacyAllocator::Free(Allocation *allocation) { - boost::apply_visitor(legacy::FreeVisitor(allocation->ptr()), - allocation->place()); + boost::apply_visitor( + legacy::FreeVisitor(allocation->ptr(), allocation->size()), + allocation->place()); delete allocation; } } // namespace allocation From 484b3bc80189824b4d3deb3af94273838a6c7480 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Thu, 24 Jan 2019 03:53:32 +0000 Subject: [PATCH 084/123] When cudnn version < 7100, there is problem with conv_fusion. Add check for it. 
test=develop --- paddle/fluid/inference/api/demo_ci/vis_demo.cc | 2 ++ paddle/fluid/inference/tests/api/CMakeLists.txt | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 5320992b7e..2c6093fa48 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -82,10 +82,12 @@ void Main(bool use_gpu) { int main(int argc, char** argv) { google::ParseCommandLineFlags(&argc, &argv, true); +#if CUDNN_VERSION >= 7100 if (FLAGS_use_gpu) { paddle::demo::Main(true /*use_gpu*/); } else { paddle::demo::Main(false /*use_gpu*/); } +#endif return 0; } diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 423c39813f..cb4af28dc9 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -144,7 +144,8 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI endif() endif() -if(WITH_GPU AND TENSORRT_FOUND) +if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) + if(WITH_GPU AND TENSORRT_FOUND) set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt") if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") @@ -152,4 +153,5 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_analysis_test(test_trt_models SRCS trt_models_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL) + endif() endif() From 96413249951c4144aa6a143e38aa2715d7e50dc1 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Thu, 24 Jan 2019 05:05:45 +0000 Subject: [PATCH 085/123] fix comments test=develop --- paddle/fluid/inference/api/demo_ci/vis_demo.cc | 2 -- .../fluid/inference/api/paddle_pass_builder.h | 17 ++++++++++------- paddle/fluid/inference/tests/api/CMakeLists.txt | 4 +--- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 2c6093fa48..5320992b7e 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -82,12 +82,10 @@ void Main(bool use_gpu) { int main(int argc, char** argv) { google::ParseCommandLineFlags(&argc, &argv, true); -#if CUDNN_VERSION >= 7100 if (FLAGS_use_gpu) { paddle::demo::Main(true /*use_gpu*/); } else { paddle::demo::Main(false /*use_gpu*/); } -#endif return 0; } diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index d3a60d2099..391932a1ee 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -154,13 +154,16 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_elementwise_add_act_fuse_pass", // - "conv_elementwise_add2_act_fuse_pass", // - "conv_elementwise_add_fuse_pass", // + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "conv_bn_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 + "conv_elementwise_add_act_fuse_pass", // + 
"conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // +#endif }); for (int i = 6; i >= 3; i--) { diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index cb4af28dc9..423c39813f 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -144,8 +144,7 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI endif() endif() -if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) - if(WITH_GPU AND TENSORRT_FOUND) +if(WITH_GPU AND TENSORRT_FOUND) set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt") if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") @@ -153,5 +152,4 @@ if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) inference_analysis_test(test_trt_models SRCS trt_models_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL) - endif() endif() From 5cfc40dea862d963dd0d2625d780cae4d33cda60 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 24 Jan 2019 14:11:00 +0800 Subject: [PATCH 086/123] nce add check sample lables, test=develop (#15463) * nce add check sample lables, test=develop --- paddle/fluid/operators/math/sampler.cc | 10 +++++++++- paddle/fluid/operators/math/sampler.h | 1 + paddle/fluid/operators/nce_op.h | 5 +++++ python/paddle/fluid/layers/nn.py | 8 ++++---- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc index 2708f3bcd8..238d9f2905 100644 --- a/paddle/fluid/operators/math/sampler.cc +++ b/paddle/fluid/operators/math/sampler.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/sampler.h" +#include #include #include #include @@ -77,7 +78,14 @@ int64_t CustomSampler::Sample() const { auto index = (*int_dist_)(*random_engine_); auto p = (*real_dist_)(*random_engine_); if (p > alias_probs_[index]) { - return alias_[index]; + int alias = alias_[index]; + + if (alias == exceptional_val) { + LOG(WARNING) << "WARNING: CustomSampler get alias " << exceptional_val; + return index; + } + + return alias; } else { return index; } diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h index 98e0b898a5..3fa5a7ae33 100644 --- a/paddle/fluid/operators/math/sampler.h +++ b/paddle/fluid/operators/math/sampler.h @@ -116,6 +116,7 @@ class CustomSampler : public Sampler { const float* alias_probs_; const int* alias_; const float* probs_; + const int exceptional_val = -1; std::shared_ptr random_engine_; std::shared_ptr> real_dist_; std::shared_ptr> int_dist_; diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 2c97eef096..3e48b67a57 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -119,6 +119,11 @@ class NCEKernel : public framework::OpKernel { PrepareSamples(context, sampler); auto sample_labels = context.Output("SampleLabels"); const int64_t *sample_labels_data = sample_labels->data(); + + for (int x = 0; x < sample_labels->numel(); x++) { + PADDLE_ENFORCE_GE(sample_labels_data[x], 0, "nce sample label %d", x); + } + auto sample_out = context.Output("SampleLogits"); T *sample_out_data = sample_out->mutable_data(context.GetPlace()); auto label = context.Input("Label"); diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e2a4c05926..0116eb10d4 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5146,9 +5146,9 @@ def nce(input, littles = [] for i in range(custom_dist_len): normal_prob = custom_dist[i] * custom_dist_len - if normal_prob - 1.0 > 1e-4: + if normal_prob - 1.0 > 0: bigs.append((i, normal_prob)) - elif 1.0 - normal_prob > 1e-4: + elif 1.0 - normal_prob > 0: littles.append((i, normal_prob)) else: alias_probs_[i] = normal_prob @@ -5164,9 +5164,9 @@ def nce(input, alias_probs_[little[0]] = little[1] alias_[little[0]] = big_idx big_left = big[1] + little[1] - 1 - if big_left - 1.0 > 1e-4: + if big_left - 1.0 > 0: bigs.append((big_idx, big_left)) - elif 1.0 - big_left > 1e-4: + elif 1.0 - big_left > 0: littles.append((big_idx, big_left)) else: alias_probs_[big_idx] = big_left From bf91d11ed524fea0dedd49fdaf4e558f9d04afe5 Mon Sep 17 00:00:00 2001 From: chengduo Date: Thu, 24 Jan 2019 00:13:50 -0600 Subject: [PATCH 087/123] Clean elementwise_op_function (#15502) test=develop --- .../elementwise/elementwise_op_function.h | 62 ------------------- 1 file changed, 62 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 7bb6934e14..cb8a4e7e15 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -277,68 +277,6 @@ class TransformFunctor { Functor func_; }; -#define EIGEN_FUNCTOR(name, eigen_op) \ - struct Eigen##name##Functor { \ - template \ - inline void Run(const framework::Tensor *x, const framework::Tensor *y, \ - framework::Tensor *z, \ - const framework::ExecutionContext &ctx) { \ - auto x_e = framework::EigenVector::Flatten(*x); \ - auto y_e = 
framework::EigenVector::Flatten(*y); \ - auto z_e = framework::EigenVector::Flatten(*z); \ - z_e.device( \ - *ctx.template device_context().eigen_device()) = \ - eigen_op(x_e, y_e); \ - } \ - template \ - inline void RunBroadCast(const framework::Tensor *x, \ - const framework::Tensor *y, framework::Tensor *z, \ - const framework::ExecutionContext &ctx, int pre, \ - int n) { \ - auto x_e = framework::EigenVector::Flatten(*x); \ - auto y_e = framework::EigenVector::Flatten(*y); \ - auto z_e = framework::EigenVector::Flatten(*z); \ - auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) \ - .broadcast(Eigen::DSizes(pre, 1)) \ - .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device( \ - *ctx.template device_context().eigen_device()) = \ - eigen_op(x_e, y_bcast); \ - } \ - template \ - inline void RunBroadCast2(const framework::Tensor *x, \ - const framework::Tensor *y, \ - framework::Tensor *z, \ - const framework::ExecutionContext &ctx, int pre, \ - int n, int post) { \ - auto x_e = framework::EigenVector::Flatten(*x); \ - auto y_e = framework::EigenVector::Flatten(*y); \ - auto z_e = framework::EigenVector::Flatten(*z); \ - auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) \ - .broadcast(Eigen::DSizes(pre, 1, post)) \ - .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device( \ - *ctx.template device_context().eigen_device()) = \ - eigen_op(x_e, y_bcast); \ - } \ - } - -#define EIGEN_ADD(x, y) ((x) + (y)) - -EIGEN_FUNCTOR(Add, EIGEN_ADD); - -#define EIGEN_SUB(x, y) ((x) - (y)) - -EIGEN_FUNCTOR(Sub, EIGEN_SUB); - -#define EIGEN_MUL(x, y) ((x) * (y)) - -EIGEN_FUNCTOR(Mul, EIGEN_MUL); - -#define EIGEN_DIV(x, y) ((x) / (y)) - -EIGEN_FUNCTOR(Div, EIGEN_DIV); - template struct ElemwiseGradNoBroadcast { const T *x_; From 22db82c05358ce112cc4f93299da26f6b546a8cd Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Thu, 24 Jan 2019 16:24:56 +0800 Subject: [PATCH 088/123] fix tangwei merge issue test=develop (#15506) --- python/paddle/fluid/parallel_executor.py | 2 +- python/paddle/fluid/transpiler/distribute_transpiler.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index a1b1d2f584..a07ff6ac69 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -159,7 +159,7 @@ class ParallelExecutor(object): trainers_endpoints = main._trainers_endpoints if num_trainers > 1 and trainers_endpoints: assert num_trainers == len( - trainers_endpoints), "num_trainers == len(end_points)" + trainers_endpoints), "num_trainers == len(endpoints)" build_strategy.trainers_endpoints = trainers_endpoints # step6: get persistable_vars, places. 
persistable_vars
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index c61cb54e1f..e58f34e375 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -477,13 +477,16 @@ class DistributeTranspiler(object):
 trainer_id,
 trainers,
 current_endpoint,
- startup_program=None):
+ startup_program=None,
+ wait_port=True):
 if not startup_program:
 startup_program = default_startup_program()
 if trainer_id >= 0:
 worker_endpoints = trainers.split(",")
 # send NCCL_ID to others or recv from trainer 0
 worker_endpoints.remove(current_endpoint)
+ if trainer_id == 0 and wait_port:
+ wait_server_ready(worker_endpoints)
 nccl_id_var = startup_program.global_block().create_var(
 name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
@@ -564,11 +567,13 @@ class DistributeTranspiler(object):
 
 if self.config.mode == "nccl2":
 assert (isinstance(trainers, str))
+ self.origin_program._trainers_endpoints = trainers.split(",")
 self._transpile_nccl2(
 trainer_id,
 trainers,
 current_endpoint,
- startup_program=startup_program)
+ startup_program=startup_program,
+ wait_port=self.config.wait_port)
 return
 
 self.trainer_num = trainers
From 0779e355442619f80a94617caf10c8cb7abab1ab Mon Sep 17 00:00:00 2001
From: nhzlx
Date: Thu, 24 Jan 2019 13:27:55 +0000
Subject: [PATCH 089/123] fix two bugs: 1. graph and program_desc alignment 2.
 trt stream
 test=develop
---
 paddle/fluid/framework/ir/graph_traits.cc | 3 ++-
 paddle/fluid/inference/analysis/ir_pass_manager.cc | 9 ++++++---
 paddle/fluid/inference/analysis/ir_pass_manager.h | 4 ++--
 .../analysis/passes/ir_graph_to_program_pass.cc | 6 +++++-
 4 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc
index 2ee12cc410..929d9edc34 100644
--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ b/paddle/fluid/framework/ir/graph_traits.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/ir/graph_traits.h"
 
+#include 
 #include 
 
 namespace paddle {
@@ -79,7 +80,7 @@ NodesTSIterator::NodesTSIterator(const std::vector &source) {
 }
 
 std::unordered_set visited;
- std::unordered_set to_visit{source.begin(), source.end()};
+ std::set to_visit{source.begin(), source.end()};
 
 std::vector inlink_visited;
 while (!to_visit.empty()) {
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 376d6aef20..9aaae16144 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -74,8 +74,9 @@ void IRPassManager::CreatePasses(Argument *argument,
 bool enable_int8 = false;
 if (argument->tensorrt_precision_mode() ==
- contrib::AnalysisConfig::Precision::kInt8)
+ contrib::AnalysisConfig::Precision::kInt8) {
 enable_int8 = true;
+ }
 pass->Set("enable_int8", new bool(enable_int8));
 pass->Set("model_dir", new std::string(argument->model_path()));
@@ -103,12 +104,14 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) {
 }
 
 framework::proto::ProgramDesc IRPassManager::AcquireProgram(
- std::unique_ptr *graph, const ProgramDesc &program) const {
+ std::unique_ptr *graph, ProgramDesc *program) const {
 auto pass =
 framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");
+ // Directly using ProgramDesc desc(argument->main_program()) may cause
+ // incomplete copies of information.
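+ // (Editor's note: ProgramDesc::Proto() flushes pending block/op edits
+ // into the underlying proto before it is copied, which a plain
+ // ProgramDesc copy may miss.)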
ProgramDesc desc;
- desc.CopyFrom(*const_cast(program).Proto());
+ desc.CopyFrom(*program->Proto());
 pass->SetNotOwned("program", &desc);
 auto *the_graph = graph->release();
 *graph = pass->Apply(std::unique_ptr(the_graph));
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h
index 983a582649..f378d35d9a 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.h
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.h
@@ -42,8 +42,8 @@ class IRPassManager final {
 
 std::unique_ptr Apply(std::unique_ptr graph);
 
- framework::proto::ProgramDesc AcquireProgram(
- std::unique_ptr *graph, const ProgramDesc &program) const;
+ framework::proto::ProgramDesc AcquireProgram(std::unique_ptr *graph,
+ ProgramDesc *program) const;
 
 framework::ir::Graph &graph() const { return *graph_; }
 
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
index f1da37af3c..6b3d80fcef 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
@@ -31,7 +31,11 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) {
 }
 
 std::unique_ptr graph(argument->main_graph_ptr());
-
+ // Directly using ProgramDesc desc(argument->main_program()) may cause
+ // incomplete copies of information.
+ framework::ProgramDesc desc;
+ desc.CopyFrom(*argument->main_program().Proto());
 pass->SetNotOwned("program", &desc);
 auto thegraph = pass->Apply(std::move(graph));
 thegraph.release(); // the argument still owns the graph.
From 3008fa1261ead553549aee8da576147701b7ef08 Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Thu, 24 Jan 2019 21:35:50 +0800
Subject: [PATCH 090/123] Add the CUDA kernel for beam_search op (#15020)

* Refine the beam_search op and test.

* A basic CUDA implementation of beam_search for small batch_size.

* Implement CUDA kernel for beam_search_op.

* Use multiple CUDA threads in the same block to select the top beam.

* Update the python api of beam_search op.

* Enable extend function in CPU kernel of beam_search op.

* Unify the CUDA codes.
test=develop

* Unify the CPU kernel of beam_search op.

* Ensure the selected items of beam_search_op's CPU kernel are sorted by scores.

* Update the description of beam_search in API.spec.

* Enable the use of CUDA kernel in beam_search op.

* Exclude the beam_search's CUDA unittest when there is no CUDA gpu, and delete some debugging statements.
test=develop

* Follow comments.
test=develop

* Call the CPU kernel for beam_search op when batch_size > 4.
test=develop

* Remove the exception for the is_empty op in PrepareData.
test=develop --- paddle/fluid/API.spec | 2 +- paddle/fluid/framework/lod_tensor.cc | 7 +- paddle/fluid/framework/mixed_vector.h | 18 +- paddle/fluid/operators/CMakeLists.txt | 3 +- paddle/fluid/operators/beam_search_op.cc | 235 ++--------- paddle/fluid/operators/beam_search_op.cu.cc | 24 ++ paddle/fluid/operators/beam_search_op.h | 191 +-------- paddle/fluid/operators/beam_search_op_test.cc | 92 ---- paddle/fluid/operators/bpr_loss_op.h | 4 +- paddle/fluid/operators/math/CMakeLists.txt | 2 + paddle/fluid/operators/math/beam_search.cc | 283 +++++++++++++ paddle/fluid/operators/math/beam_search.cu | 393 ++++++++++++++++++ paddle/fluid/operators/math/beam_search.h | 119 ++++++ .../fluid/operators/math/beam_search_test.cc | 141 +++++++ .../math/selected_rows_functor_test.cc | 2 +- .../math/selected_rows_functor_test.cu.cc | 2 +- .../operators/math/sequence_pooling_test.cc | 2 +- paddle/fluid/platform/cuda_device_function.h | 29 ++ paddle/fluid/platform/gpu_info.cc | 8 +- python/paddle/fluid/layers/nn.py | 38 +- 20 files changed, 1081 insertions(+), 514 deletions(-) create mode 100644 paddle/fluid/operators/beam_search_op.cu.cc delete mode 100644 paddle/fluid/operators/beam_search_op_test.cc create mode 100644 paddle/fluid/operators/math/beam_search.cc create mode 100644 paddle/fluid/operators/math/beam_search.cu create mode 100644 paddle/fluid/operators/math/beam_search.h create mode 100644 paddle/fluid/operators/math/beam_search_test.cc diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 4d040d219a..7ceb180193 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -122,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)) paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)) -paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) +paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name'], varargs=None, keywords=None, defaults=(0, True, None)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 8fbbc6584e..f46bdf96ba 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -54,13 +54,14 @@ 
std::ostream &operator<<(std::ostream &os, const LoD &lod) { std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { if (!platform::is_cpu_place(t.place())) { - LoDTensor tt; - framework::TensorCopy(t, platform::CPUPlace(), &tt); + LoDTensor cpu_tensor; + cpu_tensor.set_lod(t.lod()); + framework::TensorCopy(t, platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(t.place()); dev_ctx.Wait(); - os << tt; + os << cpu_tensor; return os; } diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index c3a044d22c..5d854cb8d7 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 992a2bdd5a..76419a2ea2 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -66,7 +66,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) endif() @@ -86,7 +86,6 @@ set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) -cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 30f700f1d9..e78ecc1a12 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -12,205 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include +#include "paddle/fluid/operators/beam_search_op.h" + #include #include - -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/beam_search_op.h" namespace paddle { namespace operators { -void BeamSearch::operator()(const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores, - framework::LoDTensor *selected_ids, - framework::LoDTensor *selected_scores) { - auto abs_lod = framework::ToAbsOffset(ids_->lod()); - auto &high_level = abs_lod[lod_level_]; - - auto items = SelectTopBeamSizeItems(pre_ids, pre_scores); - auto selected_items = ToMap(items, high_level.back()); - VLOG(3) << "selected_items:"; - for (size_t i = 0; i < selected_items.size(); ++i) { - VLOG(3) << "offset:" << i; - for (auto &item : selected_items[i]) { - VLOG(3) << ItemToString(item); - } - } - - PruneEndBeams(pre_ids, &selected_items); - // calculate the output tensor's height - size_t num_instances = std::accumulate( - std::begin(selected_items), std::end(selected_items), 0, - [](size_t a, std::vector &b) { return a + b.size(); }); - // the output tensor shape should be [num_instances, 1] - auto dims = framework::make_ddim( - std::vector({static_cast(num_instances), 1})); - selected_ids->Resize(dims); - selected_scores->Resize(dims); - - std::map> hash; - framework::LoD new_lod; - auto *ids_data = selected_ids->mutable_data(platform::CPUPlace()); - auto *scores_data = - selected_scores->mutable_data(platform::CPUPlace()); - - // fill in data - std::vector low_level; - size_t low_offset = 0; - for (auto &items : selected_items) { - low_level.push_back(low_offset); - for (auto &item : items) { - ids_data[low_offset] = item.id; - scores_data[low_offset] = item.score; - low_offset++; - } - } - low_level.push_back(low_offset); - - // fill lod - framework::LoD lod(2); - lod[0].assign(high_level.begin(), high_level.end()); - lod[1].assign(low_level.begin(), low_level.end()); - if (!framework::CheckLoD(lod)) { - PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); - } - selected_ids->set_lod(lod); - selected_scores->set_lod(lod); -} - -void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids, - std::vector> *items) { - auto *pre_ids_data = pre_ids.data(); - auto abs_lod = framework::ToAbsOffset(ids_->lod()); - auto &high_level = abs_lod[lod_level_]; - for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { - size_t src_prefix_start = high_level[src_idx]; - size_t src_prefix_end = high_level[src_idx + 1]; - bool finish_flag = true; - for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) { - for (auto &item : items->at(offset)) { - if (item.id != static_cast(end_id_) || - pre_ids_data[offset] != end_id_) { - finish_flag = false; - break; - } - } - if (!finish_flag) break; - } - if (finish_flag) { // all branchs of the beam (source sentence) end and - // prune this beam - for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) - items->at(offset).clear(); - } - } -} - -std::vector> BeamSearch::ToMap( - const std::vector> &items, size_t element_num) { - std::vector> result; - result.resize(element_num); - for (auto &entries : items) { - for (const auto &item : entries) { - result[item.offset].push_back(item); - } - } - return result; -} - -std::vector> BeamSearch::SelectTopBeamSizeItems( - const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores) { - std::vector> result; - std::vector items; - // for each source sentence, 
select the top beam_size items across all - // candidate sets. - while (NextItemSet(pre_ids, pre_scores, &items)) { - std::nth_element( - std::begin(items), std::begin(items) + beam_size_, std::end(items), - [](const Item &a, const Item &b) { return a.score > b.score; }); - // prune the top beam_size items. - if (items.size() > beam_size_) { - items.resize(beam_size_); - } - result.emplace_back(items); - } - VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); - for (auto &items : result) { - VLOG(3) << "item set:"; - for (auto &item : items) { - VLOG(3) << ItemToString(item); - } - } - - return result; -} - -// the candidates of a source -bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores, - std::vector *items) { - if (sent_offset_ >= ids_->NumElements(lod_level_)) { - return false; - } - // find the current candidates - auto ids = *ids_; - auto scores = *scores_; - - auto abs_lod = framework::ToAbsOffset(ids.lod()); - - auto *ids_data = ids.data(); - auto *scores_data = scores.data(); - - size_t instance_dim = 1; - for (int i = 1; i < ids.dims().size(); i++) { - instance_dim *= ids.dims()[i]; - } - - auto *pre_ids_data = pre_ids.data(); - auto *pre_scores_data = pre_scores.data(); - items->clear(); - items->reserve(framework::product(ids.dims())); - for (size_t offset = abs_lod[lod_level_][sent_offset_]; - offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { - auto pre_id = pre_ids_data[offset]; - auto pre_score = pre_scores_data[offset]; - if (pre_id == end_id_) { - // Allocate all probability mass to eos_id for finished branchs and the - // other candidate ids can be ignored. - items->emplace_back(offset, end_id_, pre_score); - } else { - for (size_t d = 0; d < instance_dim; d++) { - const size_t dim_offset = offset * instance_dim + d; - items->emplace_back(offset, ids_data[dim_offset], - scores_data[dim_offset]); - } - } - } - - sent_offset_++; - return true; -} - -std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) { - os << "{"; - os << "offset: " << item.offset << ", "; - os << "id: " << item.id << ", "; - os << "score: " << item.score << ""; - os << "}"; - - return os; -} - -std::string ItemToString(const BeamSearch::Item &item) { - std::ostringstream stream; - stream << item; - return stream.str(); -} - class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -219,18 +29,23 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor) The LoDTensor containing the selected ids at the " "previous step. It should be a tensor with shape (batch_size, 1) " "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at " - "thefirst step."); + "the first step."); AddInput("pre_scores", "(LoDTensor) The LoDTensor containing the accumulated " "scores corresponding to the selected ids at the previous step."); AddInput("ids", "(LoDTensor) The LoDTensor containing the candidates ids. Its " - "shape should be (batch_size * beam_size, K), where K supposed to " - "be beam_size."); + "shape should be (batch_size * beam_size, W). If not set, it will " + "be calculated out according to Input(scores) in this operator.") + .AsDispensable(); AddInput("scores", - "(LoDTensor) The LodTensor containing the accumulated scores " - "corresponding to Input(ids) and its shape is the same as the " - "shape of Input(ids)."); + "(LoDTensor) The LoDTensor containing the current scores " + "corresponding to Input(ids). 
If Input(ids) is not nullptr, its "
             "shape is the same as that of Input(ids). "
             "If is_accumulated is true, Input(scores) is accumulated scores "
             "and will be used directly. Otherwise, each score will be "
             "transformed to the log field and accumulated with "
             "Input(pre_scores) first.");
    AddOutput("selected_ids",
              "A LodTensor that stores the IDs selected by beam search.");
    AddOutput("selected_scores",
@@ -242,6 +57,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("beam_size", "beam size for beam search");
     AddAttr<int>("end_id",
                  "the token id which indicates the end of a sequence");
+    AddAttr<bool>("is_accumulated",
+                  "Whether the Input(scores) is accumulated scores.")
+        .SetDefault(true);
 
     AddComment(R"DOC(
 This operator does the search in beams for one time step.
@@ -265,10 +83,9 @@ class BeamSearchOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
     for (const std::string &arg :
-         std::vector<std::string>({"pre_ids", "ids", "scores"})) {
+         std::vector<std::string>({"pre_ids", "scores"})) {
       PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch need input argument '%s'",
                      arg);
     }
@@ -279,12 +96,22 @@ class BeamSearchOp : public framework::OperatorWithKernel {
     }
   }
 
+ protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("pre_ids")->type(),
-        platform::CPUPlace());
-    return kt;
+    auto *scores = ctx.Input<framework::LoDTensor>("scores");
+    size_t level = ctx.Attr<int>("level");
+    size_t batch_size = scores->lod()[level].size() - 1;
+    // The current CUDA kernel only supports cases with batch_size <= 4.
+    // Compute on CPU for cases with batch_size > 4.
+    if (batch_size <= 4) {
+      return framework::OpKernelType(
+          ctx.Input<framework::LoDTensor>("pre_ids")->type(), ctx.GetPlace());
+    } else {
+      return framework::OpKernelType(
+          ctx.Input<framework::LoDTensor>("pre_ids")->type(),
+          platform::CPUPlace());
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/beam_search_op.cu.cc b/paddle/fluid/operators/beam_search_op.cu.cc
new file mode 100644
index 0000000000..4ef9476eee
--- /dev/null
+++ b/paddle/fluid/operators/beam_search_op.cu.cc
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/beam_search_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    beam_search,
+    ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h
index b5e2ed0592..1b939e742d 100644
--- a/paddle/fluid/operators/beam_search_op.h
+++ b/paddle/fluid/operators/beam_search_op.h
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -14,187 +14,12 @@ limitations under the License. */ #pragma once -#include -#include -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/beam_search.h" namespace paddle { namespace operators { -/* - * This is an implementation of beam search. - * - * To explain the details, lets take machine translation task for example, in - * this task, one source sentence is translated to multiple target sentences, - * during this period, one sentence will be translated to multiple translation - * prefixes(target sentence that have not ended), in each time step a prefix - * will have some candidates, input the candidate ids and their corresponding - * scores (probabilities), it will sort and select the top beam_size candidates - * for each source sentence, and store the selected candidates's score and their - * corresponding ids to LoDTensors. - * - * A detailed example: - * - * Input - * - * ids: - * LoD (should have 2 levels) - * first level: [0, 1, 4] - * second level: [0, 1, 2, 3, 4] - * - * tensor's data - * [ - * [4, 2, 5] - * [2, 1, 3] - * [3, 5, 2] - * [8, 2, 1] - * ] - * - * scores: - * LoD same as `ids` - * tensor's data - * [ - * [0.5, 0.3, 0.2] - * [0.6, 0.3, 0.1] - * [0.9, 0.5, 0.1] - * [0.7, 0.5, 0.1] - * ] - * - * the inputs means that there are 2 source sentences to translate, and the - * first source has 1 prefix, the second source has 2 prefix. - * - * lets assume beam size is 2, and the beam search's output should be - * LoD - * first level: - * [0, 1, 2] - * second level: - * [0, 2, 4] - * - * id tensor's data - * [[ - * 4, - * 1, - * 3, - * 8, - * ]] - * - * score tensor's data - * [[ - * 0.5, - * 0.3, - * 0.9, - * 0.7 - * ]] - * - * TODO all the prune operations should be in the beam search, so it is better - * to split the beam search algorithm into a sequence of smaller operators, and - * the prune operators can be inserted in this sequence. - */ -class BeamSearch { - public: - // TODO(superjom) make type customizable - using id_t = size_t; - using score_t = float; - /* - * Input the arguments that needed by this class. - */ - BeamSearch(const framework::LoDTensor& ids, - const framework::LoDTensor& scores, size_t level, size_t beam_size, - int end_id) - : beam_size_(beam_size), - ids_(&ids), - scores_(&scores), - lod_level_(level), - end_id_(end_id) {} - - /* - * The main function of beam search. - * - * @selected_ids: a [None, 1]-shaped tensor with LoD. - * In a machine translation model, it might be the candidate term id sets, - * each set stored as a varience-length sequence. - * The format might be described with a two-level LoD - * - [[0 1] - * - [0 1 2]] - * - [[] - * - [0 1]] - * the first level of LoD tells that there are two source sentences. The - * second level describes the details of the candidate id set's offsets in - * the - * source sentences. - * - * @selected_scores: a LoD tensor with the same shape and LoD with - * selected_ids. - * It stores the corresponding scores of candidate ids in selected_ids. - * - * Return false if all the input tensor is empty, in machine translation task - * that means no candidates is provided, and the task will stop running. 
- */ - void operator()(const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores, - framework::LoDTensor* selected_ids, - framework::LoDTensor* selected_scores); - /* - * The basic items help to sort. - */ - struct Item { - Item() {} - Item(size_t offset, size_t id, float score) - : offset(offset), id(id), score(score) {} - // offset in the higher lod level. - size_t offset; - // // prefix id in the lower lod level. - // size_t prefix; - // the candidate id - id_t id; - // the corresponding score - score_t score; - }; - - protected: - /* - * Prune the source sentences all branchs finished, and it is optional. - * Pruning must one step later than finishing (thus pre_ids is needed here), - * since the end tokens must be writed out. - */ - void PruneEndBeams(const framework::LoDTensor& pre_ids, - std::vector>* items); - - /* - * Transform the items into a map whose key is offset, value is the items. - * NOTE low performance. - */ - std::vector> ToMap( - const std::vector>& inputs, size_t element_num); - - /* - * For each source, select top beam_size records. - */ - std::vector> SelectTopBeamSizeItems( - const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores); - - /* - * Get the items of next source sequence, return false if no remaining items. - */ - bool NextItemSet(const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores, - std::vector* items); - - private: - size_t beam_size_; - const framework::LoDTensor* ids_; - const framework::LoDTensor* scores_; - size_t lod_level_{0}; - size_t sent_offset_{0}; - int end_id_{0}; -}; - -std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item); - -std::string ItemToString(const BeamSearch::Item& item); - template class BeamSearchOpKernel : public framework::OpKernel { public: @@ -203,7 +28,7 @@ class BeamSearchOpKernel : public framework::OpKernel { auto* scores = context.Input("scores"); auto* pre_ids = context.Input("pre_ids"); auto* pre_scores = context.Input("pre_scores"); - PADDLE_ENFORCE_NOT_NULL(ids); + PADDLE_ENFORCE_NOT_NULL(scores); PADDLE_ENFORCE_NOT_NULL(pre_ids); PADDLE_ENFORCE_NOT_NULL(pre_scores); @@ -211,14 +36,20 @@ class BeamSearchOpKernel : public framework::OpKernel { size_t level = context.Attr("level"); size_t beam_size = context.Attr("beam_size"); int end_id = context.Attr("end_id"); - BeamSearch alg(*ids, *scores, level, beam_size, end_id); + bool is_accumulated = context.Attr("is_accumulated"); + auto selected_ids = context.Output("selected_ids"); auto selected_scores = context.Output("selected_scores"); PADDLE_ENFORCE_NOT_NULL(selected_ids); PADDLE_ENFORCE_NOT_NULL(selected_scores); - alg(*pre_ids, *pre_scores, selected_ids, selected_scores); + + math::BeamSearchFunctor alg; + alg(context.template device_context(), pre_ids, pre_scores, + ids, scores, selected_ids, selected_scores, level, beam_size, end_id, + is_accumulated); } }; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc deleted file mode 100644 index 40b46781da..0000000000 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/fluid/operators/beam_search_op.h" - -#include -#include - -namespace paddle { -namespace test { - -using std::vector; -using framework::LoDTensor; -using framework::LoD; -using operators::BeamSearch; -using paddle::platform::CPUPlace; -using std::cout; -using std::endl; - -void CreateInput(LoDTensor* ids, LoDTensor* scores) { - LoD lod; - vector level0({0, 2, 4}); - vector level1({0, 1, 2, 3, 4}); - lod.push_back(level0); - lod.push_back(level1); - ids->set_lod(lod); - scores->set_lod(lod); - - auto dims = framework::make_ddim(vector({4, 3})); - ids->Resize(dims); - scores->Resize(dims); - CPUPlace place; - - auto* ids_data = ids->mutable_data(place); - auto* scores_data = scores->mutable_data(place); - vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); - vector _scores( - {0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); - - for (int i = 0; i < 12; i++) { - ids_data[i] = _ids[i]; - scores_data[i] = _scores[i]; - } -} - -// It seems that beam_search_op has bugs. -TEST(DISABLED_beam_search_op, run) { - CPUPlace place; - LoDTensor ids, scores; - CreateInput(&ids, &scores); - - LoDTensor pre_ids; - pre_ids.Resize(framework::make_ddim(vector(4, 1))); - for (int i = 0; i < 4; i++) { - pre_ids.mutable_data(place)[i] = i + 1; - } - LoDTensor pre_scores; - pre_scores.Resize(framework::make_ddim(vector(4, 1))); - for (int i = 0; i < 4; i++) { - pre_scores.mutable_data(place)[i] = 0.1 * (i + 1); - } - - BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0); - LoDTensor sids, sscores; - beamsearch(pre_ids, pre_scores, &sids, &sscores); - - LOG(INFO) << "score: " << sscores << endl; - - ASSERT_EQ(sids.lod(), sscores.lod()); - - vector tids({4, 2, 3, 8}); - vector tscores({0.5f, 0.6f, 0.9f, 0.7f}); - - for (int i = 0; i < 4; i++) { - ASSERT_EQ(tids[i], sids.data()[i]); - ASSERT_EQ(tscores[i], sscores.data()[i]); - } -} - -} // namespace test -} // namespace paddle diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index e223be7af8..f9570e4e2e 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -87,8 +87,8 @@ class BprLossGradientOpKernel : public framework::OpKernel { auto* label = ctx.Input("Label"); auto* dx = ctx.Output(framework::GradVarName("X")); - const int step_size = x->dims()[0]; - const int num_classes = x->dims()[1]; + const size_t step_size = static_cast(x->dims()[0]); + const size_t num_classes = static_cast(x->dims()[1]); T* dx_data = dx->mutable_data(ctx.GetPlace()); const T* dy_data = dy->data(); const T* x_data = x->data(); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index dc27e543f0..6bbb7155dd 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -54,6 +54,7 @@ math_library(sequence_padding) math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_scale) math_library(softmax DEPS math_function) +math_library(beam_search DEPS math_function) math_library(matrix_bit_code) @@ -68,6 +69,7 @@ cc_test(im2col_test 
SRCS im2col_test.cc DEPS im2col) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling) +cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search) if(WITH_GPU) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function) diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc new file mode 100644 index 0000000000..fb7119273a --- /dev/null +++ b/paddle/fluid/operators/math/beam_search.cc @@ -0,0 +1,283 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/beam_search.h" +#include +#include + +namespace paddle { +namespace operators { +namespace math { + +template +class BeamSearchFunctor { + public: + void operator()(const platform::CPUDeviceContext &context, + const framework::LoDTensor *pre_ids, + const framework::LoDTensor *pre_scores, + const framework::LoDTensor *ids, + const framework::LoDTensor *scores, + framework::LoDTensor *selected_ids, + framework::LoDTensor *selected_scores, size_t level, + size_t beam_size, int end_id, bool is_accumulated) { + auto abs_lod = framework::ToAbsOffset(scores->lod()); + auto &high_level = abs_lod[level]; + + auto items = SelectTopBeamSizeItems(pre_ids, pre_scores, ids, scores, level, + beam_size, end_id, is_accumulated); + auto selected_items = ToMap(items, high_level.back()); + if (FLAGS_v == 3) { + VLOG(3) << "selected_items:"; + for (size_t i = 0; i < selected_items.size(); ++i) { + VLOG(3) << "offset: " << i; + for (auto &item : selected_items[i]) { + VLOG(3) << item.ToString(); + } + } + } + + PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id); + // calculate the output tensor's height + size_t num_instances = std::accumulate( + std::begin(selected_items), std::end(selected_items), 0, + [](size_t a, std::vector &b) { return a + b.size(); }); + // the output tensor shape should be [num_instances, 1] + auto dims = framework::make_ddim( + std::vector({static_cast(num_instances), 1})); + selected_ids->Resize(dims); + selected_scores->Resize(dims); + + auto *selected_ids_data = + selected_ids->mutable_data(platform::CPUPlace()); + auto *selected_scores_data = + selected_scores->mutable_data(platform::CPUPlace()); + + // fill in data + std::vector low_level; + size_t low_offset = 0; + for (auto &items : selected_items) { + low_level.push_back(low_offset); + for (auto &item : items) { + selected_ids_data[low_offset] = item.id; + selected_scores_data[low_offset] = item.score; + low_offset++; + } + } + low_level.push_back(low_offset); + + // fill lod + framework::LoD lod(2); + lod[0].assign(high_level.begin(), high_level.end()); + lod[1].assign(low_level.begin(), low_level.end()); + if 
(!framework::CheckLoD(lod)) {
+      PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
+    }
+    selected_ids->set_lod(lod);
+    selected_scores->set_lod(lod);
+  }
+
+  /*
+   * The basic items that help to sort.
+   */
+  struct Item {
+    Item() {}
+    Item(size_t offset, size_t id, float score)
+        : offset(offset), id(id), score(score) {}
+    // offset in the higher lod level.
+    size_t offset;
+    // prefix id in the lower lod level.
+    // size_t prefix;
+    // the candidate id
+    size_t id;
+    // the corresponding score
+    float score;
+
+    inline bool operator<(const Item &in) const {
+      return (score < in.score) ||
+             ((score == in.score) && (offset < in.offset));
+    }
+
+    inline void operator=(const Item &in) {
+      offset = in.offset;
+      id = in.id;
+      score = in.score;
+    }
+
+    std::string ToString() {
+      std::ostringstream os;
+      os << "{";
+      os << "offset: " << offset << ", ";
+      os << "id: " << id << ", ";
+      os << "score: " << score << "";
+      os << "}";
+      return os.str();
+    }
+  };
+
+ protected:
+  /*
+   * Prune the source sentences whose branches are all finished, which is
+   * optional. Pruning must be one step later than finishing (thus pre_ids is
+   * needed here), since the end tokens must be written out.
+   */
+  void PruneEndBeams(const framework::LoDTensor *pre_ids,
+                     const framework::LoD &abs_lod,
+                     std::vector<std::vector<Item>> *items, size_t lod_level,
+                     int end_id) {
+    auto *pre_ids_data = pre_ids->data<int64_t>();
+    auto &high_level = abs_lod[lod_level];
+    for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
+      size_t src_prefix_start = high_level[src_idx];
+      size_t src_prefix_end = high_level[src_idx + 1];
+      bool finish_flag = true;
+      for (size_t offset = src_prefix_start; offset < src_prefix_end;
+           offset++) {
+        for (auto &item : items->at(offset)) {
+          if (item.id != static_cast<size_t>(end_id) ||
+              pre_ids_data[offset] != end_id) {
+            finish_flag = false;
+            break;
+          }
+        }
+        if (!finish_flag) break;
+      }
+      if (finish_flag) {  // all branches of the beam (source sentence) end and
+                          // prune this beam
+        for (size_t offset = src_prefix_start; offset < src_prefix_end;
+             offset++)
+          items->at(offset).clear();
+      }
+    }
+  }
+
+  /*
+   * Transform the items into a map whose key is offset, value is the items.
+   * NOTE low performance.
+   */
+  std::vector<std::vector<Item>> ToMap(
+      const std::vector<std::vector<Item>> &items, size_t element_num) {
+    std::vector<std::vector<Item>> result;
+    result.resize(element_num);
+    for (auto &entries : items) {
+      for (const auto &item : entries) {
+        result[item.offset].push_back(item);
+      }
+    }
+    return result;
+  }
+
+  void Insert(std::vector<Item> *top_beam_ptr, const Item &item,
+              size_t beam_size) {
+    std::vector<Item> &top_beam = *top_beam_ptr;
+
+    size_t num_beams = top_beam.size();
+    if (num_beams < beam_size) {
+      top_beam.resize(num_beams + 1);
+      num_beams++;
+    } else {
+      if (item < top_beam[beam_size - 1]) {
+        return;
+      }
+    }
+
+    for (int k = static_cast<int>(num_beams) - 2; k >= 0; --k) {
+      if (top_beam[k] < item) {
+        top_beam[k + 1] = top_beam[k];
+      } else {
+        top_beam[k + 1] = item;
+        return;
+      }
+    }
+    top_beam[0] = item;
+  }
+
+  /*
+   * For each source, select top beam_size records.
+   */
+  std::vector<std::vector<Item>> SelectTopBeamSizeItems(
+      const framework::LoDTensor *pre_ids,
+      const framework::LoDTensor *pre_scores, const framework::LoDTensor *ids,
+      const framework::LoDTensor *scores, size_t lod_level, size_t beam_size,
+      int end_id, bool is_accumulated) {
+    std::vector<std::vector<Item>> result;
+
+    // find the current candidates
+    auto abs_lod = framework::ToAbsOffset(scores->lod());
+
+    auto *pre_ids_data = pre_ids->data<int64_t>();
+    auto *pre_scores_data = pre_scores->data<float>();
+
+    auto *ids_data = ids ?
ids->data<int64_t>() : nullptr;
+    auto *scores_data = scores->data<float>();
+
+    size_t num_seqs = scores->NumElements(lod_level);
+    size_t seq_width = 1;
+    for (int i = 1; i < scores->dims().size(); i++) {
+      seq_width *= scores->dims()[i];
+    }
+
+    for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) {
+      size_t seq_offset_start = abs_lod[lod_level][seq_id];
+      size_t seq_offset_end = abs_lod[lod_level][seq_id + 1];
+
+      std::vector<Item> top_beam;
+      top_beam.reserve(beam_size);
+
+      for (size_t offset = seq_offset_start; offset < seq_offset_end;
+           ++offset) {
+        auto pre_id = pre_ids_data[offset];
+        auto pre_score = pre_scores_data[offset];
+        if (pre_id == end_id) {
+          // Allocate all probability mass to end_id for finished branches and
+          // the other candidate ids can be ignored.
+          Item item(offset, end_id, pre_score);
+          Insert(&top_beam, item, beam_size);
+        } else {
+          size_t index = offset * seq_width;
+          for (size_t d = 0; d < seq_width; d++, index++) {
+            int64_t id = ids_data ? ids_data[index] : static_cast<int64_t>(d);
+            float score = is_accumulated
+                              ? scores_data[index]
+                              : pre_score + std::log(scores_data[index]);
+            Item item(offset, id, score);
+            Insert(&top_beam, item, beam_size);
+          }
+        }
+      }
+
+      result.emplace_back(top_beam);
+    }
+
+    if (FLAGS_v == 3) {
+      VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
+      for (auto &items : result) {
+        VLOG(3) << "item set:";
+        for (auto &item : items) {
+          VLOG(3) << item.ToString();
+        }
+      }
+    }
+
+    return result;
+  }
+};
+
+template class BeamSearchFunctor<platform::CPUDeviceContext, int>;
+template class BeamSearchFunctor<platform::CPUDeviceContext, int64_t>;
+template class BeamSearchFunctor<platform::CPUDeviceContext, float>;
+template class BeamSearchFunctor<platform::CPUDeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu
new file mode 100644
index 0000000000..d94e3023ce
--- /dev/null
+++ b/paddle/fluid/operators/math/beam_search.cu
@@ -0,0 +1,393 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/operators/math/beam_search.h" +#include "paddle/fluid/platform/cuda_device_function.h" + +namespace paddle { +namespace operators { +namespace math { + +struct Triple { + __device__ __forceinline__ Triple() {} + __device__ __forceinline__ Triple(int o, int i, float s) + : offset(o), id(i), score(s) {} + + __device__ __forceinline__ void set(int o, int i, float s) { + offset = o; + id = i; + score = s; + } + + __device__ __forceinline__ void operator=(const Triple& in) { + offset = in.offset; + id = in.id; + score = in.score; + } + + __device__ __forceinline__ bool operator<(const float s) const { + return score < s; + } + + __device__ __forceinline__ bool operator<(const Triple& in) const { + return (score < in.score) || ((score == in.score) && (offset < in.offset)); + } + + int offset; + int id; + float score; +}; + +__device__ __forceinline__ void Insert(Triple* top_beam, const Triple& p, + int beam_size) { + if (p < top_beam[beam_size - 1]) { + return; + } + for (int k = beam_size - 2; k >= 0; --k) { + if (top_beam[k] < p) { + top_beam[k + 1] = top_beam[k]; + } else { + top_beam[k + 1] = p; + return; + } + } + top_beam[0] = p; +} + +template +__device__ __forceinline__ int SelectTopBeam( + Triple* top_beam, const int64_t* pre_ids, const float* pre_scores, + const int64_t* ids, const float* scores, const int seq_offset_start, + const int seq_offset_end, const int seq_width, int beam_size, int end_id, + int used_threads) { + // top_beam is shared memory + const int tid = threadIdx.x; + const int tid_of_seq = threadIdx.x % MaxThreadsPerSeq; + + int num_used_threads = used_threads; + + Triple* top_beam_local = top_beam + tid * beam_size; + if (tid_of_seq < num_used_threads) { + for (int i = 0; i < beam_size; ++i) { + top_beam_local[i].set(-1, -1, -INFINITY); + } + + for (int offset = seq_offset_start; offset < seq_offset_end; ++offset) { + int pre_id = static_cast(pre_ids[offset]); + if (pre_id == end_id) { + if (tid_of_seq == 0) { + Triple tmp(offset, end_id, pre_scores[offset]); + Insert(top_beam_local, tmp, beam_size); + } + } else { + int index = offset * seq_width + tid_of_seq; + if (!IsAccumulated) { + float pre_score = pre_scores[offset]; + for (int i = tid_of_seq; i < seq_width; i += num_used_threads) { + float score = pre_score + __logf(scores[index]); + int id = ids ? static_cast(ids[index]) : i; + Triple tmp(offset, id, score); + Insert(top_beam_local, tmp, beam_size); + index += num_used_threads; + } + } else { + for (int i = tid_of_seq; i < seq_width; i += num_used_threads) { + int id = ids ? static_cast(ids[index]) : i; + float score = scores[index]; + Triple tmp(offset, id, score); + Insert(top_beam_local, tmp, beam_size); + index += num_used_threads; + } + } + } + } + } + + while (num_used_threads > 1) { + if (num_used_threads > 16) { + __syncthreads(); + } + + num_used_threads = num_used_threads >> 1; + if (tid_of_seq < num_used_threads) { + int index_in_sh = (num_used_threads + tid) * beam_size; + for (int i = 0; i < beam_size; i++) { + Insert(top_beam_local, top_beam[index_in_sh], beam_size); + index_in_sh++; + } + } + } + + if (tid_of_seq == 0) { + int num_items = 0; + for (int i = 0; i < beam_size; ++i) { + num_items = + (top_beam_local[i].score > -INFINITY) ? 
num_items + 1 : num_items; + } + return num_items; + } + + return 0; +} + +__device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local, + const int64_t* pre_ids, + const int end_id, int num_items) { + bool finish_flag = true; + for (int i = 0; i < num_items; ++i) { + int offset = top_beam_local[i].offset; + if (top_beam_local[i].id != end_id || + static_cast(pre_ids[offset]) != end_id) { + finish_flag = false; + break; + } + } + return finish_flag; +} + +__device__ __forceinline__ void WriteBack( + int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, + Triple* top_beam_local, const int seq_offset_start, + const int seq_offset_end, const int selected_seq_start, + const int selected_seq_length) { + const int tid = threadIdx.x; // use 1 thread only for each sequence + int global_index = selected_seq_start; + for (int global_offset = seq_offset_start; global_offset < seq_offset_end; + ++global_offset) { + for (int local_index = 0; local_index < selected_seq_length; + ++local_index) { + if (top_beam_local[local_index].offset == global_offset) { + selected_ids[global_index] = + static_cast(top_beam_local[local_index].id); + selected_scores[global_index] = top_beam_local[local_index].score; + global_index++; + } + } + selected_offsets[global_offset + 1] = static_cast(global_index); + } +} + +template +__device__ void BeamSearchDetails( + int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, + const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, + const float* scores, const int seq_offset_start, const int seq_offset_end, + const int seq_width, int beam_size, int end_id, bool is_accumulated, + int num_used_threads) { + __shared__ Triple top_beam[MaxLength]; + + int num_items = 0; + if (is_accumulated) { + num_items = SelectTopBeam( + top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start, + seq_offset_end, seq_width, beam_size, end_id, num_used_threads); + } else { + num_items = SelectTopBeam( + top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start, + seq_offset_end, seq_width, beam_size, end_id, num_used_threads); + } + + const int tid = threadIdx.x; // use 1 thread only for each sequence + const int tid_of_seq = tid % MaxThreadsPerSeq; + if (tid_of_seq == 0) { + // Use 1 thread for each sequence. + Triple* top_beam_local = top_beam + tid * beam_size; + bool finish_flag = + PruneEndBeams(top_beam_local, pre_ids, end_id, num_items); + + int selected_seq_start = 0; + int selected_seq_length = finish_flag ? 0 : num_items; + + if (MaxSeqs > 1) { + const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid; + __shared__ int shared_mem[MaxSeqs]; + + // [0, MaxSeqs - 1], length of each sequences + shared_mem[seq_id] = selected_seq_length; + __syncthreads(); + + for (int s = 0; s < seq_id; ++s) { + selected_seq_start += shared_mem[s]; + } + + if (seq_id == 0) { + selected_offsets[0] = 0; + } + } else { + selected_offsets[0] = 0; + } + + WriteBack(selected_ids, selected_scores, selected_offsets, top_beam_local, + seq_offset_start, seq_offset_end, selected_seq_start, + selected_seq_length); + } +} + +template +__global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores, + size_t* selected_offsets, + const int64_t* pre_ids, + const float* pre_scores, const int64_t* ids, + const float* scores, const size_t* seq_offsets, + const int num_seqs, const int seq_width, + int beam_size, int end_id, bool is_accumulated, + int num_used_threads) { + const int tid = threadIdx.x; + const int seq_id = (MaxSeqs > 1) ? 
tid / MaxThreadsPerSeq : tid; + + int seq_offset_start = static_cast(seq_offsets[seq_id]); + int seq_offset_end = static_cast(seq_offsets[seq_id + 1]); + + BeamSearchDetails( + selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids, + scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id, + is_accumulated, num_used_threads); +} + +template +__global__ void BeamSearchKernelSingle( + int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, + const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, + const float* scores, const int seq_length, const int seq_width, + int beam_size, int end_id, bool is_accumulated, int num_used_threads) { + const int seq_offset_start = 0; + const int seq_offset_end = seq_length; + + BeamSearchDetails( + selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids, + scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id, + is_accumulated, num_used_threads); +} + +static inline int GetNumUsedThreads(const int max_threads_per_seq, + const int seq_width, int beam_size) { + int num_used_threads = (seq_width + beam_size - 1) / beam_size; + num_used_threads = max_threads_per_seq < num_used_threads + ? max_threads_per_seq + : num_used_threads; + + num_used_threads = + num_used_threads > 32 + ? (num_used_threads >> 5) << 5 + : (num_used_threads > 16 + ? 32 + : (num_used_threads > 8 + ? 16 + : (num_used_threads > 4 + ? 8 + : (num_used_threads > 2 ? 4 + : num_used_threads)))); + return num_used_threads; +} + +template +class BeamSearchFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::LoDTensor* pre_ids, + const framework::LoDTensor* pre_scores, + const framework::LoDTensor* ids, + const framework::LoDTensor* scores, + framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores, size_t level, + size_t beam_size, int end_id, bool is_accumulated) { + auto abs_lod = framework::ToAbsOffset(scores->lod()); + + const int64_t* pre_ids_data = pre_ids->data(); + const float* pre_scores_data = pre_scores->data(); + const int64_t* ids_data = ids ? ids->data() : nullptr; + const float* scores_data = scores->data(); + + const size_t num_seqs = abs_lod[level].size() - 1; + size_t seq_width = 1; + for (int i = 1; i < scores->dims().size(); i++) { + seq_width *= scores->dims()[i]; + } + + // Reserve a big enough memory. 
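+    // At most num_seqs * beam_size items can be selected in one step, so the
+    // output is allocated at that upper bound here and shrunk to the actual
+    // number of selected items (selected_lod[1].back()) after the kernel runs.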
+ auto selected_dims = + framework::make_ddim({static_cast(num_seqs * beam_size), 1}); + int64_t* selected_ids_data = + selected_ids->mutable_data(selected_dims, context.GetPlace()); + float* selected_scores_data = + selected_scores->mutable_data(selected_dims, context.GetPlace()); + + framework::LoD selected_lod(2); + selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); + selected_lod[1].resize(scores->dims()[0] + 1); + size_t* selected_offsets = + selected_lod[1].CUDAMutableData(context.GetPlace()); + + if (num_seqs == 1) { + const int seq_length = static_cast(abs_lod[level][1]); + const int kMaxThreadsPerSeq = 1024; + int num_used_threads = + GetNumUsedThreads(kMaxThreadsPerSeq, static_cast(seq_width), + static_cast(beam_size)); + switch (platform::RoundToPowerOfTwo(beam_size * seq_width)) { + CUDA_LAUNCH_KERNEL_HELPER( + BeamSearchKernelSingle<<< + 1, kMaxThreadsPerSeq, 0, context.stream()>>>( + selected_ids_data, selected_scores_data, selected_offsets, + pre_ids_data, pre_scores_data, ids_data, scores_data, + seq_length, static_cast(seq_width), + static_cast(beam_size), static_cast(end_id), + is_accumulated, num_used_threads)); + } + } else if (num_seqs <= 4) { + const size_t* seq_offsets = abs_lod[level].CUDAData(context.GetPlace()); + // Use only 1 block + const int kMaxThreadsPerSeq = 32; + const int kMaxSeqs = 4; + int num_used_threads = + GetNumUsedThreads(kMaxThreadsPerSeq, static_cast(seq_width), + static_cast(beam_size)); + switch (platform::RoundToPowerOfTwo(beam_size * num_seqs * 32)) { + CUDA_LAUNCH_KERNEL_HELPER( + BeamSearchKernel<<< + 1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>( + selected_ids_data, selected_scores_data, selected_offsets, + pre_ids_data, pre_scores_data, ids_data, scores_data, + seq_offsets, static_cast(num_seqs), + static_cast(seq_width), static_cast(beam_size), + end_id, is_accumulated, num_used_threads)); + } + } else { + LOG(FATAL) << "Not implemented."; + } + + context.Wait(); + if (!framework::CheckLoD(selected_lod)) { + PADDLE_THROW("lod %s is not right", framework::LoDToString(selected_lod)); + } + + selected_ids->set_lod(selected_lod); + selected_scores->set_lod(selected_lod); + if (selected_lod[1].back() < num_seqs * beam_size) { + auto final_selected_dims = framework::make_ddim( + {static_cast(selected_lod[1].back()), 1}); + selected_ids->Resize(final_selected_dims); + selected_scores->Resize(final_selected_dims); + } + } +}; + +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search.h b/paddle/fluid/operators/math/beam_search.h new file mode 100644 index 0000000000..3cd17f426c --- /dev/null +++ b/paddle/fluid/operators/math/beam_search.h @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/*
+ * This is an implementation of beam search.
+ *
+ * To explain the details, let's take the machine translation task as an
+ * example. In this task, one source sentence is translated to multiple target
+ * sentences, so one sentence will be expanded into multiple translation
+ * prefixes (target sentences that have not ended). In each time step a prefix
+ * will have some candidates; given the candidate ids and their corresponding
+ * scores (probabilities), it sorts and selects the top beam_size candidates
+ * for each source sentence, and stores the selected candidates' scores and
+ * their corresponding ids to LoDTensors.
+ *
+ * A detailed example:
+ *
+ * Input
+ *
+ * ids:
+ * - LoD (should have 2 levels)
+ *   - first level: [0, 1, 4]
+ *   - second level: [0, 1, 2, 3, 4]
+ * - tensor's data:
+ *     [[4, 2, 5]
+ *      [2, 1, 3]
+ *      [3, 5, 2]
+ *      [8, 2, 1]]
+ *
+ * scores:
+ * - LoD same as `ids`
+ * - tensor's data
+ *     [[0.5, 0.3, 0.2]
+ *      [0.6, 0.3, 0.1]
+ *      [0.9, 0.5, 0.1]
+ *      [0.7, 0.5, 0.1]]
+ *
+ * The inputs mean that there are 2 source sentences to translate; the first
+ * source has 1 prefix and the second source has 2 prefixes.
+ *
+ * Let's assume the beam size is 2; then the beam search's output should be
+ * - LoD
+ *   - first level: [0, 1, 2]
+ *   - second level: [0, 2, 4]
+ * - id tensor's data
+ *     [[4,
+ *       1,
+ *       3,
+ *       8]]
+ * - score tensor's data
+ *     [[0.5,
+ *       0.3,
+ *       0.9,
+ *       0.7]]
+ *
+ * TODO: all the prune operations should be in the beam search, so it is better
+ * to split the beam search algorithm into a sequence of smaller operators, and
+ * the prune operators can be inserted into this sequence.
+ */
+template <typename DeviceContext, typename T>
+class BeamSearchFunctor {
+ public:
+  /*
+   * The main function of beam search.
+   *
+   * @selected_ids: a [None, 1]-shaped tensor with LoD.
+   *   In a machine translation model, it might be the candidate term id sets,
+   *   each set stored as a variable-length sequence.
+   *   The format might be described with a two-level LoD
+   *   - [[0 1],
+   *      [0 1 2]]
+   *   - [[]
+   *      [0 1]]
+   *   the first level of LoD tells that there are two source sentences. The
+   *   second level describes the details of the candidate id set's offsets in
+   *   the source sentences.
+   *
+   * @selected_scores: a LoD tensor with the same shape and LoD as
+   *   selected_ids.
+   *   It stores the corresponding scores of candidate ids in selected_ids.
+   *
+   * Return false if all the input tensors are empty; in the machine
+   * translation task that means no candidate is provided, and the task will
+   * stop running.
+   */
+  void operator()(const DeviceContext& context,
+                  const framework::LoDTensor* pre_ids,
+                  const framework::LoDTensor* pre_scores,
+                  const framework::LoDTensor* ids,
+                  const framework::LoDTensor* scores,
+                  framework::LoDTensor* selected_ids,
+                  framework::LoDTensor* selected_scores, size_t level,
+                  size_t beam_size, int end_id, bool is_accumulated);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc
new file mode 100644
index 0000000000..1c29ee95f6
--- /dev/null
+++ b/paddle/fluid/operators/math/beam_search_test.cc
@@ -0,0 +1,141 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/beam_search.h" +#include +#include + +void PrepareCPUTensors(paddle::framework::LoDTensor* ids, + paddle::framework::LoDTensor* scores, + paddle::framework::LoDTensor* pre_ids, + paddle::framework::LoDTensor* pre_scores) { + // lod + paddle::framework::LoD lod; + std::vector level0({0, 2, 4}); + std::vector level1({0, 1, 2, 3, 4}); + lod.push_back(level0); + lod.push_back(level1); + ids->set_lod(lod); + scores->set_lod(lod); + + auto dims = paddle::framework::make_ddim({4, 3}); + ids->Resize(dims); + scores->Resize(dims); + + paddle::platform::CPUPlace place; + auto* ids_data = ids->mutable_data(place); + auto* scores_data = scores->mutable_data(place); + std::vector ids_vec_data({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); + std::vector scores_vec_data( + {0.6f, 0.3f, 0.5f, 0.2f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); + + CHECK_EQ(static_cast(ids->numel()), ids_vec_data.size()); + CHECK_EQ(static_cast(ids->numel()), scores_vec_data.size()); + + for (int i = 0; i < ids->numel(); i++) { + ids_data[i] = ids_vec_data[i]; + scores_data[i] = scores_vec_data[i]; + } + + // pre_ids + pre_ids->Resize(paddle::framework::make_ddim({4, 1})); + for (int i = 0; i < 4; i++) { + pre_ids->mutable_data(place)[i] = i + 1; + } + + // pre_scores + pre_scores->Resize(paddle::framework::make_ddim({4, 1})); + for (int i = 0; i < 4; i++) { + pre_scores->mutable_data(place)[i] = 0.1 * (i + 1); + } +} + +template +void TestBeamSearch() { + paddle::framework::LoDTensor ids; + paddle::framework::LoDTensor scores; + paddle::framework::LoDTensor pre_ids; + paddle::framework::LoDTensor pre_scores; + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores); + } else { + paddle::framework::LoDTensor cpu_ids; + paddle::framework::LoDTensor cpu_scores; + paddle::framework::LoDTensor cpu_pre_ids; + paddle::framework::LoDTensor cpu_pre_scores; + + PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores); + + TensorCopySync(cpu_ids, *place, &ids); + TensorCopySync(cpu_scores, *place, &scores); + TensorCopySync(cpu_pre_ids, *place, &pre_ids); + TensorCopySync(cpu_pre_scores, *place, &pre_scores); + + ids.set_lod(cpu_ids.lod()); + scores.set_lod(cpu_scores.lod()); + pre_ids.set_lod(cpu_pre_ids.lod()); + pre_scores.set_lod(cpu_pre_scores.lod()); + } + + paddle::framework::LoDTensor selected_ids; + paddle::framework::LoDTensor selected_scores; + + size_t level = 0; + size_t beam_size = 2; + int end_id = 0; + paddle::operators::math::BeamSearchFunctor beamsearch; + beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids, + &selected_scores, level, beam_size, end_id, true); + + ASSERT_EQ(selected_ids.lod(), selected_scores.lod()); + + paddle::framework::LoDTensor cpu_selected_ids; + paddle::framework::LoDTensor cpu_selected_scores; + if (paddle::platform::is_cpu_place(*place)) { + cpu_selected_ids = 
selected_ids; + cpu_selected_scores = selected_scores; + } else { + TensorCopySync(selected_ids, paddle::platform::CPUPlace(), + &cpu_selected_ids); + TensorCopySync(selected_scores, paddle::platform::CPUPlace(), + &cpu_selected_scores); + cpu_selected_ids.set_lod(selected_ids.lod()); + cpu_selected_scores.set_lod(selected_scores.lod()); + } + + std::vector expected_ids({4, 5, 3, 8}); + std::vector expected_scores({0.6f, 0.5f, 0.9f, 0.7f}); + for (int i = 0; i < 4; i++) { + ASSERT_EQ(expected_ids[i], cpu_selected_ids.data()[i]); + ASSERT_EQ(expected_scores[i], cpu_selected_scores.data()[i]); + } + + delete place; + delete context; +} + +TEST(BeamSearch, CPU) { + TestBeamSearch(); +} + +#ifdef PADDLE_WITH_CUDA +TEST(BeamSearch, GPU) { + TestBeamSearch(); +} +#endif diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index f15b37a1e3..aedb82da2f 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -354,7 +354,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { auto* out_data = output->value().data(); for (size_t i = 0; i < ret_rows.size(); ++i) { - for (size_t j = 0; j < row_numel; ++j) { + for (size_t j = 0; j < static_cast(row_numel); ++j) { EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); } } diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 73d83fa2e4..74892316e6 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -301,7 +301,7 @@ TEST(selected_rows_functor, gpu_merge_add) { auto* out_data = output_cpu.data(); for (size_t i = 0; i < ret_rows.size(); ++i) { - for (size_t j = 0; j < row_numel; ++j) { + for (size_t j = 0; j < static_cast(row_numel); ++j) { EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); } } diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 5535523e79..cf6e89b3d9 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -66,7 +66,7 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { cpu_in_grad.set_lod(in_grad.lod()); } - EXPECT_EQ(in_grad.numel(), lod[0].back() * second_dim); + EXPECT_EQ(in_grad.numel(), static_cast(lod[0].back() * second_dim)); EXPECT_EQ(in_grad.lod(), lod); if (paddle::platform::is_cpu_place(*place)) { diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index 9f504d14a8..2ce8f141d3 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include // NOTE(): support float16 to half in header file. #define PADDLE_CUDA_FP16 @@ -30,6 +31,34 @@ namespace platform { mask = __ballot_sync(FULL_WARP_MASK, (predicate)) #endif +inline static int RoundToPowerOfTwo(int dim) { + if (dim > 512) { + return 1024; + } else if (dim > 256) { + return 512; + } else if (dim > 128) { + return 256; + } else if (dim > 64) { + return 128; + } else if (dim > 32) { + return 64; + } else { + return 32; + } +} + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) 
\
+  case (dim): {                            \
+    constexpr auto kPowerOfTwoDim = (dim); \
+    __VA_ARGS__;                           \
+  } break
+
+#define CUDA_LAUNCH_KERNEL_HELPER(...)         \
+  CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \
+  CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \
+  CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__);  \
+  CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__);
+
 template <typename T>
 __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
                                                  int delta, int width = 32) {
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 5ee5d37183..400a6d7bfa 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -221,13 +221,17 @@ size_t GpuMaxChunkSize() {
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                     enum cudaMemcpyKind kind, cudaStream_t stream) {
   PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
-                 "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
+                 "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync "
+                 "(%p -> %p, length: %d)",
+                 src, dst, static_cast<int>(count));
 }
 
 void GpuMemcpySync(void *dst, const void *src, size_t count,
                    enum cudaMemcpyKind kind) {
   PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
-                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync");
+                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync (%p -> "
+                 "%p, length: %d)",
+                 src, dst, static_cast<int>(count));
 }
 
 void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 0116eb10d4..3392903843 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3875,6 +3875,7 @@ def beam_search(pre_ids,
                 beam_size,
                 end_id,
                 level=0,
+                is_accumulated=True,
                 name=None):
     """
     Beam search is a classical algorithm for selecting candidate words in a
@@ -3887,14 +3888,17 @@ def beam_search(pre_ids,
     selects the top-K candidate word ids of current step from :attr:`ids`
     according to their :attr:`scores` for all source sentences, where K is
     :attr:`beam_size` and :attr:`ids, scores` are predicted results from the
-    computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are
-    the output of beam_search at previous step, they are needed for special use
-    to handle ended candidate translations.
-
-    Note that the :attr:`scores` passed in should be accumulated scores, and
-    length penalty should be done with extra operators before calculating the
-    accumulated scores if needed, also suggest finding top-K before it and
-    using the top-K candidates following.
+    computation cell. If :attr:`ids` is not set, it will be calculated
+    according to :attr:`scores`. Additionally, :attr:`pre_ids` and
+    :attr:`pre_scores` are the output of beam_search at the previous step;
+    they are needed for special use to handle ended candidate translations.
+
+    Note that if :attr:`is_accumulated` is :attr:`True`, the :attr:`scores`
+    passed in should be accumulated scores. Otherwise, the :attr:`scores` are
+    considered as straightforward scores and will be transformed to the log
+    field and accumulated with :attr:`pre_scores` in this operator.
+    Length penalty should be done with extra operators before calculating the
+    accumulated scores if needed.
 
     Please see the following demo for a full beam search usage example:
@@ -3924,6 +3928,8 @@ def beam_search(pre_ids,
             describes how these candidates belong to the prefix. The paths
             linking prefixes and selected candidates are organized and reserved
             in lod.
+ is_accumulated(bool, default True): Whether the input :attr:`score` is + accumulated scores. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -3952,8 +3958,12 @@ def beam_search(pre_ids, end_id=end_id) """ helper = LayerHelper('beam_search', **locals()) - score_type = scores.dtype - id_type = ids.dtype + score_type = pre_scores.dtype + id_type = pre_ids.dtype + + inputs = {"pre_ids": pre_ids, "pre_scores": pre_scores, "scores": scores} + if ids is not None: + inputs["ids"] = ids selected_scores = helper.create_variable_for_type_inference( dtype=score_type) @@ -3961,12 +3971,7 @@ def beam_search(pre_ids, helper.append_op( type='beam_search', - inputs={ - 'pre_ids': pre_ids, - 'pre_scores': pre_scores, - 'ids': ids, - 'scores': scores, - }, + inputs=inputs, outputs={ 'selected_ids': selected_ids, 'selected_scores': selected_scores, @@ -3976,6 +3981,7 @@ def beam_search(pre_ids, 'level': level, 'beam_size': beam_size, 'end_id': end_id, + 'is_accumulated': is_accumulated, }) return selected_ids, selected_scores From 4d9feb35b9f740cf12f32c6353f92a2d31c5df67 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 24 Jan 2019 22:14:20 +0800 Subject: [PATCH 091/123] support multi grad ops test=develop --- paddle/fluid/imperative/layer.cc | 83 +++++++++-------- paddle/fluid/imperative/layer.h | 13 +-- paddle/fluid/imperative/tracer.cc | 90 ++++++++++--------- .../fluid/tests/unittests/test_imperative.py | 15 ++++ 4 files changed, 116 insertions(+), 85 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 8029129b9a..23a1f0f348 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -204,59 +204,68 @@ framework::LoDTensor& VarBase::GradValue() { } std::map> OpBase::ApplyGrad() { - if (!grad_op_desc_ && backward_id_ <= 0) { + if (grad_op_descs_.empty() && backward_id_ <= 0) { LOG(WARNING) << "op with no grad: " << op_desc_->Type(); return {}; } - std::map> grad_outputs; + std::vector grad_outputs; if (backward_id_ > 0) { + grad_outputs.resize(1); VLOG(3) << "py_layer_grad"; - grad_outputs[framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad( - backward_id_, - grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]); + grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] = + PyLayer::ApplyGrad( + backward_id_, + grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]); } else { - VLOG(3) << "op grad " << grad_op_desc_->Type(); - for (auto it : grad_output_vars_) { - auto& outputs = grad_outputs[it.first]; - for (size_t i = 0; i < it.second.size(); ++i) { - // Allocate a new variable - Variable* tmp_var = new framework::Variable(); - tmp_var->GetMutable(); - outputs.push_back(tmp_var); + grad_outputs.resize(grad_op_descs_.size()); + for (size_t k = 0; k < grad_op_descs_.size(); ++k) { + framework::OpDesc* grad_op_desc = grad_op_descs_[k]; + VLOG(3) << "op grad " << grad_op_desc->Type(); + for (auto it : grad_output_vars_[k]) { + auto& outputs = grad_outputs[k][it.first]; + for (size_t i = 0; i < it.second.size(); ++i) { + // Allocate a new variable + Variable* tmp_var = new framework::Variable(); + tmp_var->GetMutable(); + outputs.push_back(tmp_var); + } } - } - framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); + framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]); - // No need to do compile time infer shape here. 
- // grad_op_desc_->InferShape(*block_); - grad_op_desc_->InferVarType(block_); + // No need to do compile time infer shape here. + // grad_op_desc_->InferShape(*block_); + grad_op_desc->InferVarType(block_); - std::unique_ptr opbase = - framework::OpRegistry::CreateOp(*grad_op_desc_); - framework::OperatorWithKernel* op_kernel = - dynamic_cast(opbase.get()); - PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + std::unique_ptr opbase = + framework::OpRegistry::CreateOp(*grad_op_desc); + framework::OperatorWithKernel* op_kernel = + dynamic_cast(opbase.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); - framework::Scope scope; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); - p.op.RuntimeInferShape(scope, place_, ctx); - p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + framework::Scope scope; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); + p.op.RuntimeInferShape(scope, place_, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + } } - for (auto it : grad_output_vars_) { - auto& outputs = grad_outputs[it.first]; - auto& origin_outputs = it.second; - PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); - - for (size_t i = 0; i < outputs.size(); ++i) { - framework::Variable* grad = outputs[i]; - framework::Variable* orig_grad = origin_outputs[i]; - AddTo(grad, orig_grad, place_); - delete grad; + for (size_t k = 0; k < grad_output_vars_.size(); ++k) { + for (auto it : grad_output_vars_[k]) { + auto& outputs = grad_outputs[k][it.first]; + auto& origin_outputs = it.second; + PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); + + for (size_t i = 0; i < outputs.size(); ++i) { + framework::Variable* grad = outputs[i]; + framework::Variable* orig_grad = origin_outputs[i]; + AddTo(grad, orig_grad, place_); + delete grad; + } } } + return input_vars_; } diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 633924aa41..1f4c31b197 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -184,12 +184,13 @@ class OpBase { OpBase() : op_desc_(nullptr), forward_id_(-1), - grad_op_desc_(nullptr), backward_id_(-1), place_(platform::CPUPlace()) {} virtual ~OpBase() { - if (grad_op_desc_) delete grad_op_desc_; + for (framework::OpDesc* desc : grad_op_descs_) { + delete desc; + } } std::map> ApplyGrad(); @@ -198,9 +199,9 @@ class OpBase { // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_. framework::OpDesc* op_desc_; int forward_id_; - // When has backward, one of `grad_op_desc_` or `backward_id_` is set, + // When has backward, one of `grad_op_descs_` or `backward_id_` is set, // not both. 
- framework::OpDesc* grad_op_desc_; + std::vector grad_op_descs_; int backward_id_; platform::Place place_; @@ -210,8 +211,8 @@ class OpBase { OpBasePtrMap pre_ops_; std::map> pre_ops_out_idx_; - framework::VariableValueMap grad_input_vars_; - framework::VariableValueMap grad_output_vars_; + std::vector grad_input_vars_; + std::vector grad_output_vars_; framework::BlockDesc* block_; }; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 5b87839f45..cd62807a55 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -24,15 +24,16 @@ namespace imperative { void CreateGradOp(const framework::OpDesc& op_desc, const std::unordered_set& no_grad_set, const std::vector& grad_sub_block, - framework::OpDesc** grad_op_desc, + std::vector* grad_op_descs, std::unordered_map* grad_to_var) { - std::vector> grad_op_descs = + PADDLE_ENFORCE(grad_op_descs->empty()); + std::vector> descs = framework::OpInfoMap::Instance() .Get(op_desc.Type()) .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); - PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now."); - // TODO(panyx0718): Leak? - *grad_op_desc = grad_op_descs[0].release(); + for (auto& desc : descs) { + grad_op_descs->emplace_back(desc.release()); + } } void InitVar(framework::Variable* var, framework::Variable* grad_var, @@ -138,49 +139,52 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx)); if (!stop_gradient) { - framework::OpDesc* grad_op_desc; - // TODO(panyx): Is this leaked? std::unique_ptr> grad_to_var( new std::unordered_map()); - CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get()); - op->grad_op_desc_ = grad_op_desc; - - for (auto it : grad_op_desc->Inputs()) { - auto& grad_in_vars = op->grad_input_vars_[it.first]; - for (const std::string& grad_invar : it.second) { - block->FindRecursiveOrCreateVar(grad_invar); - auto var_it = grad_to_var->find(grad_invar); - if (var_it == grad_to_var->end()) { - auto fwd_var_it = vars.find(grad_invar); - PADDLE_ENFORCE(fwd_var_it != vars.end()); - // Forward inputs or outputs. - grad_in_vars.push_back(fwd_var_it->second->var_); - } else { + CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get()); + + op->grad_input_vars_.resize(op->grad_op_descs_.size()); + op->grad_output_vars_.resize(op->grad_op_descs_.size()); + for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) { + framework::OpDesc* grad_op_desc = op->grad_op_descs_[i]; + for (auto it : grad_op_desc->Inputs()) { + auto& grad_in_vars = op->grad_input_vars_[i][it.first]; + for (const std::string& grad_invar : it.second) { + block->FindRecursiveOrCreateVar(grad_invar); + auto var_it = grad_to_var->find(grad_invar); + if (var_it == grad_to_var->end()) { + auto fwd_var_it = vars.find(grad_invar); + PADDLE_ENFORCE(fwd_var_it != vars.end()); + // Forward inputs or outputs. + grad_in_vars.push_back(fwd_var_it->second->var_); + } else { + VarBase* var = vars[var_it->second]; + if (!var->grads_->var_->IsInitialized()) { + InitVar(var->var_, var->grads_->var_, + prepared_op.GetDeviceContext()); + } + // Douts. 
+ grad_in_vars.push_back(var->grads_->var_); + } + } + } + + for (auto it : grad_op_desc->Outputs()) { + auto& grad_out_vars = op->grad_output_vars_[i][it.first]; + for (const std::string& grad_outvar : it.second) { + block->FindRecursiveOrCreateVar(grad_outvar); + auto var_it = grad_to_var->find(grad_outvar); + PADDLE_ENFORCE(var_it != grad_to_var->end(), + "Could not found the grad op output var, should this " + "operator %s's stop gradient be True", + op_desc->Type()); VarBase* var = vars[var_it->second]; if (!var->grads_->var_->IsInitialized()) { InitVar(var->var_, var->grads_->var_, prepared_op.GetDeviceContext()); } - // Douts. - grad_in_vars.push_back(var->grads_->var_); - } - } - } - - for (auto it : grad_op_desc->Outputs()) { - auto& grad_out_vars = op->grad_output_vars_[it.first]; - for (const std::string& grad_outvar : it.second) { - block->FindRecursiveOrCreateVar(grad_outvar); - auto var_it = grad_to_var->find(grad_outvar); - PADDLE_ENFORCE(var_it != grad_to_var->end(), - "Could not found the grad op output var, should this " - "operator %s's stop gradient be True", - op_desc->Type()); - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_, prepared_op.GetDeviceContext()); + grad_out_vars.push_back(var->grads_->var_); } - grad_out_vars.push_back(var->grads_->var_); } } } @@ -209,10 +213,12 @@ std::vector Tracer::PyTrace(OpBase* op, out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient); } if (!stop_gradient) { + op->grad_input_vars_.resize(1); + op->grad_output_vars_.resize(1); auto& grad_input_vars = - op->grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]; + op->grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]; auto& grad_output_vars = - op->grad_output_vars_[framework::GradVarName(PyLayer::kFwdOut)]; + op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)]; for (const VarBase* inp : inputs) { grad_input_vars.push_back(inp->var_); diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 7533ab9fdb..40f9b325fe 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -67,6 +67,21 @@ class MLP(fluid.imperative.Layer): class TestImperative(unittest.TestCase): + def test_sum_op(self): + with fluid.imperative.guard(): + inputs = [] + for _ in range(10): + inputs.append( + fluid.imperative.base.to_variable( + np.ones([2, 2], np.float32))) + sys.stderr.write('%s\n' % inputs[0].dtype) + ret = fluid.layers.sums(inputs) + sys.stderr.write('%s\n' % ret.dtype) + loss = fluid.layers.reduce_sum(ret) + sys.stderr.write('%s\n' % loss.dtype) + loss._backward() + sys.stderr.write('%s %s\n' % (ret._numpy(), inputs[0]._gradient())) + def test_layer(self): with fluid.imperative.guard(): cl = core.Layer() From b67584a6e9a5083a4e6c5c87b9ac4b68d48a3647 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 24 Jan 2019 17:14:58 +0000 Subject: [PATCH 092/123] jit benchmark use tensor test=develop --- paddle/fluid/operators/jit/CMakeLists.txt | 2 +- paddle/fluid/operators/jit/benchmark.cc | 107 ++++++++++++++-------- 2 files changed, 71 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index 262094f922..35775d7ec9 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -21,5 +21,5 @@ endif() cc_library(jit_kernel_helper SRCS 
${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) if(NOT WIN32) - cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer) + cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer tensor) endif() diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 74d6a87247..186c37c56e 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -18,6 +18,7 @@ #include #include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/place.h" @@ -155,14 +156,22 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { LOG(INFO) << loginfos.str(); } +using Tensor = paddle::framework::Tensor; + template void BenchXYZNKernel() { for (int d : TestSizes()) { - std::vector x(d), y(d), z(d); - RandomVec(d, x.data()); - RandomVec(d, y.data()); - BenchAllImpls, PlaceType>(d, x.data(), y.data(), - z.data(), d); + Tensor x, y, z; + x.Resize({d}); + y.Resize({d}); + z.Resize({d}); + T* x_data = x.mutable_data(PlaceType()); + T* y_data = y.mutable_data(PlaceType()); + T* z_data = z.mutable_data(PlaceType()); + RandomVec(d, x_data); + RandomVec(d, y_data); + BenchAllImpls, PlaceType>(d, x.data(), + y.data(), z_data, d); } } @@ -170,9 +179,13 @@ template void BenchAXYNKernel() { for (int d : TestSizes()) { const T a = static_cast(3); - std::vector x(d), y(d); - RandomVec(d, x.data()); - BenchAllImpls, PlaceType>(d, &a, x.data(), y.data(), + Tensor x, y; + x.Resize({d}); + y.Resize({d}); + T* x_data = x.mutable_data(PlaceType()); + T* y_data = y.mutable_data(PlaceType()); + RandomVec(d, x_data); + BenchAllImpls, PlaceType>(d, &a, x.data(), y_data, d); } } @@ -180,9 +193,13 @@ void BenchAXYNKernel() { template void BenchXYNKernel() { for (int d : TestSizes()) { - std::vector x(d), y(d); - RandomVec(d, x.data()); - BenchAllImpls, PlaceType>(d, x.data(), y.data(), d); + Tensor x, y; + x.Resize({d}); + y.Resize({d}); + T* x_data = x.mutable_data(PlaceType()); + T* y_data = y.mutable_data(PlaceType()); + RandomVec(d, x_data); + BenchAllImpls, PlaceType>(d, x.data(), y_data, d); } } @@ -192,16 +209,23 @@ void BenchLSTMKernel() { for (int d : TestSizes()) { const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, use_peephole); - std::vector x(4 * d), ct_1(d), ct(d), ht(d), wp(3 * d), checked(2 * d); - RandomVec(4 * d, x.data(), -2.f, 2.f); - RandomVec(3 * d, wp.data(), -2.f, 2.f); - RandomVec(d, ct_1.data(), -2.f, 2.f); - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - T* x_data = x.data(); - T* checked_data = checked.data(); - T* ct_data = ct.data(); - T* ht_data = ht.data(); + Tensor x, ct_1, ct, ht, wp, checked; + x.Resize({4 * d}); + ct_1.Resize({d}); + ct.Resize({d}); + ht.Resize({d}); + wp.Resize({3 * d}); + checked.Resize({2 * d}); + auto place = PlaceType(); + RandomVec(x.numel(), x.mutable_data(place), -2.f, 2.f); + RandomVec(wp.numel(), wp.mutable_data(place), -2.f, 2.f); + RandomVec(ct_1.numel(), ct_1.mutable_data(place), -2.f, 2.f); + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + T* x_data = x.mutable_data(place); + T* checked_data = checked.mutable_data(place); + T* ct_data = ct.mutable_data(place); + T* ht_data = ht.mutable_data(place); jit::lstm_t step; step.gates = 
x_data; step.ct_1 = ct_1_data; @@ -220,12 +244,16 @@ template void BenchGRUKernel() { for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); - std::vector x(3 * d), ht_1(d), ht(d); - RandomVec(3 * d, x.data(), -2.f, 2.f); - RandomVec(d, ht_1.data(), -2.f, 2.f); - const T* ht_1_data = ht_1.data(); - T* x_data = x.data(); - T* ht_data = ht.data(); + auto place = PlaceType(); + Tensor x, ht_1, ht; + x.Resize({3 * d}); + ht_1.Resize({d}); + ht.Resize({d}); + RandomVec(3 * d, x.mutable_data(place), -2.f, 2.f); + RandomVec(d, ht_1.mutable_data(place), -2.f, 2.f); + const T* ht_1_data = ht_1.data(); + T* x_data = x.mutable_data(place); + T* ht_data = ht.mutable_data(place); jit::gru_t step; step.gates = x_data; step.ht_1 = ht_1_data; @@ -243,10 +271,12 @@ void BenchSeqPoolKernel() { jit::seq_pool_attr_t attr(w, type); for (int h : TestSizes()) { attr.h = h; - std::vector x(h * w), y(w); - RandomVec(h * w, x.data(), -2.f, 2.f); - const T* x_data = x.data(); - T* y_data = y.data(); + Tensor x, y; + x.Resize({h * w}); + y.Resize({w}); + RandomVec(h * w, x.mutable_data(PlaceType()), -2.f, 2.f); + const T* x_data = x.data(); + T* y_data = y.mutable_data(PlaceType()); BenchAllImpls, PlaceType>(attr, x_data, y_data, &attr); } @@ -259,12 +289,15 @@ void BenchMatMulKernel() { for (int m : {1, 2, 3, 4}) { for (int n : TestSizes()) { for (int k : TestSizes()) { - std::vector a(m * k), b(k * n), c(m * n); - RandomVec(m * k, a.data(), -2.f, 2.f); - RandomVec(k * n, b.data(), -2.f, 2.f); - const T* a_data = a.data(); - const T* b_data = b.data(); - T* c_data = c.data(); + Tensor a, b, c; + a.Resize({m * k}); + b.Resize({k * n}); + c.Resize({m * n}); + RandomVec(m * k, a.mutable_data(PlaceType()), -2.f, 2.f); + RandomVec(k * n, b.mutable_data(PlaceType()), -2.f, 2.f); + const T* a_data = a.data(); + const T* b_data = b.data(); + T* c_data = c.mutable_data(PlaceType()); BenchAllImpls, PlaceType>(k, a_data, b_data, c_data, m, n, k); } From f8f91fb4b3b404bfc3c32072cf61fe4f25d349b7 Mon Sep 17 00:00:00 2001 From: chengduo Date: Thu, 24 Jan 2019 19:42:13 -0600 Subject: [PATCH 093/123] Revert conv transpose cudnn (#15514) * Revert "set constant for loss" This reverts commit 167933f678ccbb3563e949710279efe004a27731. * Revert "remove workspace_handle" test=develop This reverts commit b4aca8ede9e685bce1dfb1c59e63919f33432572. 
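A note on the second revert above: it restores the callback-style workspace pattern that the conv kernels below go back to. Rather than grabbing a temporary allocation up front and threading a raw pointer through every cuDNN call, each call is wrapped in a lambda that a workspace handle invokes with scratch memory it owns, so allocation, reuse, and lifetime stay inside the handle. A minimal self-contained sketch of that shape (toy types, not the real paddle::platform API; only the RunFunc(callback, size) convention mirrors the diffs that follow):

// Toy model of the workspace-handle pattern the revert restores. The class
// name and members here are stand-ins; only the callback convention is real.
#include <functional>
#include <iostream>
#include <vector>

class WorkspaceHandle {
 public:
  // Owns the scratch buffer; the callback borrows it for a single call.
  void RunFunc(const std::function<void(void*)>& func, size_t required_bytes) {
    if (buffer_.size() < required_bytes) buffer_.resize(required_bytes);
    func(buffer_.data());
  }

 private:
  std::vector<char> buffer_;
};

int main() {
  WorkspaceHandle workspace_handle;
  auto cudnn_func = [&](void* cudnn_workspace) {
    // A cuDNN call that needs scratch space would run here, receiving
    // cudnn_workspace for exactly the duration of the call.
    std::cout << "workspace at " << cudnn_workspace << "\n";
  };
  workspace_handle.RunFunc(cudnn_func, 1 << 20);  // ask for 1 MiB of scratch
  return 0;
}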
--- paddle/fluid/operators/conv_fusion_op.cu.cc | 65 ++++++++----------- .../operators/conv_transpose_cudnn_op.cu.cc | 57 ++++++++-------- .../fused/fusion_conv_inception_op.cu | 23 ++++--- paddle/fluid/operators/warpctc_cudnn_op.cu.cc | 24 ++++--- 4 files changed, 76 insertions(+), 93 deletions(-) diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index f97ebecfdd..d8b997cca6 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -104,9 +104,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionFwdAlgo_t algo; auto handle = dev_ctx.cudnn_handle(); - - Tensor cudnn_workspace; - void* cudnn_workspace_ptr = nullptr; + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_DEFAULT_MATH)); @@ -120,24 +118,19 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { workspace_size_limit, &algo)); VLOG(3) << "cuDNN forward algo " << algo; } else { - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_limit)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - auto search_func = [&]() { int returned_algo_count; std::array fwd_perf_stat; - - CUDNN_ENFORCE(platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( - handle, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, - kNUM_CUDNN_FWD_ALGS, &returned_algo_count, fwd_perf_stat.data(), - cudnn_workspace_ptr, workspace_size_limit)); - + auto cudnn_find_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, + kNUM_CUDNN_FWD_ALGS, &returned_algo_count, + fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); VLOG(3) << "Perf result: (algo: stat, time, memory)"; for (int i = 0; i < returned_algo_count; ++i) { const auto& stat = fwd_perf_stat[i]; @@ -188,15 +181,6 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - if (!cudnn_workspace_ptr) { - cudnn_workspace = - ctx.AllocateTmpTensor( - framework::make_ddim( - {static_cast(workspace_size_in_bytes)}), - dev_ctx); - cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - } - if ((activation == "identity") && (!residual)) { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. 
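One detail in the hunk above deserves a gloss: cudnnFindConvolutionForwardAlgorithmEx does not merely query heuristics, it actually executes the candidate algorithms, which is why the probe itself runs inside the workspace callback with the full scratch buffer. The kernel then chooses among the returned fwd_perf_stat entries; below is a toy version of such a selection step, assuming a fastest-under-the-memory-cap policy (an illustration, not a quote of the actual policy, which lives outside this hunk):

// Stand-in for cudnnConvolutionFwdAlgoPerf_t results and a plausible
// selection rule: the fastest algorithm whose workspace need fits the limit.
#include <cstdio>
#include <vector>

struct PerfStat {
  int algo;
  float time_ms;
  size_t memory;  // workspace bytes this algorithm requires
};

int main() {
  const size_t workspace_size_limit = 4096;  // assumed cap for the example
  std::vector<PerfStat> fwd_perf_stat = {
      {0, 0.8f, 8192}, {1, 1.1f, 2048}, {2, 1.6f, 512}};
  int chosen = -1;
  float best_time = 1e30f;
  for (const auto& stat : fwd_perf_stat) {
    if (stat.memory <= workspace_size_limit && stat.time_ms < best_time) {
      chosen = stat.algo;
      best_time = stat.time_ms;
    }
  }
  std::printf("chosen algo: %d (%.1f ms)\n", chosen, best_time);
  return 0;
}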
@@ -204,12 +188,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // cudnnConvolutionForward and cudnnAddTensor // ------------- cudnn conv forward and bias add --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; - - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr, - workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); - + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); CUDNN_ENFORCE(platform::dynload::cudnnAddTensor( handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc, output_data)); @@ -220,13 +205,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // ------------------- cudnn conv+bias+act forward -------------------- ScalingParamType alpha1 = 1.0f; ScalingParamType alpha2 = residual ? 1.0f : 0.0f; - - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( - handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr, - workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, - cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, - output_data)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, + cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, + output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } std::vector channels = ctx.Attr>("split_channels"); if (channels.size()) { diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index 016cf8448c..f44094ca6b 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -104,18 +104,16 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { int output_offset = output->numel() / output->dims()[0] / groups; int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; - - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size_in_bytes); - void* cudnn_workspace = temp_allocation->ptr(); - + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, - cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, - algo, cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_output_desc, output_data + output_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, + cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, + algo, cudnn_workspace, workspace_size_in_bytes, &beta, + 
cudnn_output_desc, output_data + output_offset * g)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } }; @@ -211,22 +209,20 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { output_grad->numel() / output_grad->dims()[0] / groups; int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; - - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size_in_bytes); - void* cudnn_workspace = temp_allocation->ptr(); - + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_filter_desc, - filter_data + filter_offset * g, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + input_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_filter_desc, + filter_data + filter_offset * g, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + input_offset * g)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } @@ -236,12 +232,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset filter_grad. // Gradient with respect to the filter for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_input_desc, - input_data + input_offset * g, cudnn_conv_desc, filter_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + filter_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_input_desc, + input_data + input_offset * g, cudnn_conv_desc, filter_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + filter_offset * g)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } } } diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index c72a966c57..6e13887866 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -216,19 +216,18 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { out_datas.push_back( static_cast(output_data + (oc0 + oc1 + oc2) * h * w)); - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size_in_bytes); - void* cudnn_workspace = temp_allocation->ptr(); - for (int i = 0; i < 4; ++i) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( - handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], - static_cast(filters[i]->data()), conv_desc[i], - algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, out_desc[i], - out_datas[i], bias_desc[i], - 
static_cast(bias[i]->data()), cudnn_act_desc, - out_desc[i], out_datas[i])); + auto func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], + static_cast(filters[i]->data()), conv_desc[i], + algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, + out_desc[i], out_datas[i], bias_desc[i], + static_cast(bias[i]->data()), cudnn_act_desc, + out_desc[i], out_datas[i])); + }; + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + workspace_handle.RunFunc(func, workspace_size_in_bytes); } cudnnTensorDescriptor_t x_desc; diff --git a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc index 5e16a209e7..a764d59410 100644 --- a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc +++ b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc @@ -144,19 +144,17 @@ class CudnnCTCKernel : public framework::OpKernel { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, &workspace_size)); T* loss_data = loss->mutable_data(loss_dims, ctx.GetPlace()); - math::SetConstant()( - ctx.template device_context(), loss, static_cast(0)); - - auto temp_allocation = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - workspace_size); - void* cudnn_workspace = temp_allocation->ptr(); - - CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss( - handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data, - warpctc_label_lengths.data(), warpctc_logits_lengths.data(), loss_data, - cu_grad_desc, warpctc_grad_data, CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, - cu_ctcloss_desc, cudnn_workspace, workspace_size)); + + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss( + handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data, + warpctc_label_lengths.data(), warpctc_logits_lengths.data(), + loss_data, cu_grad_desc, warpctc_grad_data, + CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, cudnn_workspace, + workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); } }; From 7166b52a6e211281887d43d67a7df2773d084de0 Mon Sep 17 00:00:00 2001 From: chengduo Date: Thu, 24 Jan 2019 19:42:25 -0600 Subject: [PATCH 094/123] add limit_of_tmp_allocation for CI (#15513) test=develop --- cmake/generic.cmake | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 1f4dbe0b49..6679a09dfc 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -388,6 +388,7 @@ function(cc_test TARGET_NAME) endif() set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) # No unit test should exceed 10 minutes. 
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) @@ -460,6 +461,7 @@ function(nv_test TARGET_NAME) endif() set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() endfunction(nv_test) @@ -708,9 +710,10 @@ function(py_test TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true - FLAGS_cpu_deterministic=true + FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) From efce25673c59294549107710f01d80a5ef211d82 Mon Sep 17 00:00:00 2001 From: baojun <32073718+baojun-nervana@users.noreply.github.com> Date: Thu, 24 Jan 2019 17:57:03 -0800 Subject: [PATCH 095/123] Adding ngraph_engine_op (#14948) * enable ngraph_engine_op test=develop * merge develop test=develop * avoid const_cast test=develop * rm ngraph_operator test=develop * Added TODO to move EnableNgraph test=develop * Add TODO to remove const_cast test=develop --- paddle/fluid/framework/CMakeLists.txt | 11 +- paddle/fluid/framework/executor.cc | 24 +- paddle/fluid/framework/ngraph_operator.h | 64 --- paddle/fluid/operators/CMakeLists.txt | 1 + paddle/fluid/operators/ngraph/CMakeLists.txt | 4 + .../fluid/operators/ngraph/ngraph_engine.cc | 492 ++++++++++++++++++ paddle/fluid/operators/ngraph/ngraph_engine.h | 93 ++++ .../operators/ngraph/ngraph_engine_op.cc | 52 ++ .../fluid/operators/ngraph/ngraph_engine_op.h | 58 +++ 9 files changed, 708 insertions(+), 91 deletions(-) delete mode 100644 paddle/fluid/framework/ngraph_operator.h create mode 100644 paddle/fluid/operators/ngraph/CMakeLists.txt create mode 100644 paddle/fluid/operators/ngraph/ngraph_engine.cc create mode 100644 paddle/fluid/operators/ngraph/ngraph_engine.h create mode 100644 paddle/fluid/operators/ngraph/ngraph_engine_op.cc create mode 100644 paddle/fluid/operators/ngraph/ngraph_engine_op.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index a167511160..8cb0c4e668 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -131,8 +131,6 @@ cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc if(WITH_NGRAPH) cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) - cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler) endif(WITH_NGRAPH) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) @@ -171,13 +169,12 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - else() - if(WITH_NGRAPH) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog 
lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper) - else(WITH_NGRAPH) + if (WITH_NGRAPH) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine) + else () cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) - endif(WITH_NGRAPH) + endif() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index c93bbe7cee..4323883fa5 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -27,7 +27,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_NGRAPH -#include "paddle/fluid/framework/ngraph_operator.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" #endif DECLARE_bool(benchmark); @@ -133,24 +133,6 @@ static void DeleteUnusedTensors( } } -static void EnableFusedOp(ExecutorPrepareContext* ctx) { -#ifdef PADDLE_WITH_NGRAPH - VLOG(3) << "use_ngraph=True"; - auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_); - for (auto& interval : intervals) { - auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0), - interval.at(1)); - *interval[0] = std::unique_ptr(ng_op); - } - for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { - ctx->ops_.erase(it->at(0) + 1, it->at(1)); - } -#else - LOG(WARNING) - << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; -#endif -} - Executor::Executor(const platform::Place& place) : place_(place) {} void Executor::Close() { @@ -204,6 +186,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); +#ifdef PADDLE_WITH_NGRAPH + if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc); +#endif auto ctx = Prepare(pdesc, block_id); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -379,7 +364,6 @@ std::unique_ptr Executor::Prepare( for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } - if (FLAGS_use_ngraph) EnableFusedOp(ctx.get()); return ctx; } diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h deleted file mode 100644 index ede80f44be..0000000000 --- a/paddle/fluid/framework/ngraph_operator.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/op_kernel_type.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/variant.h" - -#include "ngraph/type/element_type.hpp" - -namespace paddle { -namespace framework { - -class NgraphOperator : public OperatorBase { - public: - static std::vector< - std::vector>::iterator>> - NgraphOpIntervals( - std::vector>* ops); - - explicit NgraphOperator( - const ProgramDesc& prog, size_t block_id, - std::vector>::iterator start, - std::vector>::iterator end, - const std::string& type = "fused_op", const VariableNameMap& inputs = {}, - const VariableNameMap& outputs = {}, const AttributeMap& attrs = {}); - - void RunImpl(const Scope& scope, const platform::Place& place) const final; - - private: - const ProgramDesc pdesc_; - size_t block_; - std::vector> fused_ops_; - std::unordered_map var_type_map_; - std::unordered_set persistables_; - std::unordered_set fetches_; - std::unordered_set post_op_inputs_; - bool is_full_ = false; - - void Process(); -}; -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 76419a2ea2..e099425b94 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -13,6 +13,7 @@ add_subdirectory(detection) add_subdirectory(elementwise) add_subdirectory(fused) add_subdirectory(metrics) +add_subdirectory(ngraph) add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) diff --git a/paddle/fluid/operators/ngraph/CMakeLists.txt b/paddle/fluid/operators/ngraph/CMakeLists.txt new file mode 100644 index 0000000000..83f78d505d --- /dev/null +++ b/paddle/fluid/operators/ngraph/CMakeLists.txt @@ -0,0 +1,4 @@ +if(WITH_NGRAPH) + cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto) + op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context) +endif() diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc new file mode 100644 index 0000000000..fde3a5ba55 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc @@ -0,0 +1,492 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/ngraph_bridge.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" + +namespace paddle { +namespace operators { + +static ngraph::Shape Ddim2Shape(const framework::DDim& dims) { + ngraph::Shape sp; + for (int i = 0; i < dims.size(); ++i) { + int k = dims[i]; + k = k == 0 ? 1 : k; + sp.push_back(k); + } + return sp; +} + +static std::map + pd2ng_type_map = { + {framework::proto::VarType::FP32, ngraph::element::f32}, + {framework::proto::VarType::FP64, ngraph::element::f64}, + {framework::proto::VarType::INT32, ngraph::element::i32}, + {framework::proto::VarType::INT64, ngraph::element::i64}, + {framework::proto::VarType::BOOL, ngraph::element::boolean}, +}; + +std::unordered_map> + NgraphEngine::func_cache_ = {}; + +std::shared_ptr NgraphEngine::backend_ = + ngraph::runtime::Backend::create("CPU"); + +static std::vector> NgraphOpIntervals( + framework::BlockDesc* block) { + std::vector> intervals; + auto ops = block->AllOps(); + int size = ops.size(); + int left = 0; + while (left < size && ops.at(left)->Type() != framework::kFeedOpType) { + ++left; + } + if (left == size) { + return intervals; + } + while (left < size && ops.at(left)->Type() == framework::kFeedOpType) { + ++left; + } + + int right = left; + while (right < size && ops.at(right)->Type() != framework::kFetchOpType) { + ++right; + } + if (right == size) { + return intervals; + } + if (left >= right) return intervals; + + // (left, right - 1) represents indices between feed and fetch + int pivot = left; + while (pivot < right) { + auto op_type = ops.at(pivot)->Type(); + if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) == + paddle::framework::NgraphBridge::NG_NODE_MAP.end()) { + ++pivot; + } else { + int start = pivot, end = start; + while (pivot < right && + (paddle::framework::NgraphBridge::NG_NODE_MAP.find( + ops.at(pivot)->Type()) != + paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { + ++pivot; + ++end; + } + std::vector interval = {start, end}; + intervals.push_back(interval); + } + } // end while + return intervals; +} + +static void SubstituteNgraphOp(framework::BlockDesc* block, + std::string block_str, + std::vector interval) { + framework::ProgramDesc program; + block->RemoveOp(interval.at(0), interval.at(1)); + auto* ng_op = block->InsertOp(interval.at(0)); + ng_op->SetType("ngraph_engine"); + ng_op->SetAttr("interval", interval); + ng_op->SetAttr("graph", block_str); +} + +// TODO(baojun-nervana): Move EnableNgraph to compile time per PR #15089 +void NgraphEngine::EnableNgraph(const framework::ProgramDesc& program) { +#ifdef PADDLE_WITH_NGRAPH + VLOG(4) << "use_ngraph=True"; + for (size_t bid = 0; bid < program.Size(); ++bid) { + // TODO(baojun-nervana): Remove the const_cast + auto* block = + const_cast(program).MutableBlock(bid); + std::string block_str = block->Proto()->SerializeAsString(); + auto intervals = NgraphOpIntervals(block); + for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { + SubstituteNgraphOp(block, block_str, *it); + } + } +#else + LOG(WARNING) 
+ << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; +#endif +} + +NgraphEngine::NgraphEngine(const framework::Scope& scope, + const platform::Place& place, + const std::string& serialized_graph, + const std::vector& interval) + : scope_(scope), place_(place) { + var_in_node_map_ = std::make_shared< + std::unordered_map>>(); + + var_node_map_ = std::make_shared< + std::unordered_map>>(); + + func_cache_key_ = std::to_string(interval[0]) + std::to_string(interval[1]) + + serialized_graph; + + framework::proto::BlockDesc bdesc; + bdesc.ParseFromString(serialized_graph); + framework::BlockDesc block(nullptr, &bdesc); + + Prepare(block, interval); + + BuildNgIO(); + + GetNgFunction(); +} + +void NgraphEngine::Prepare(const framework::BlockDesc& block, + const std::vector& interval) { + for (auto& var : block.AllVars()) { + if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS || + var->GetType() == framework::proto::VarType::LOD_TENSOR || + var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) { + continue; + } + + auto var_name = var->Name(); + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var_name != framework::kFeedOpType && + var_name != framework::kFetchOpType) { + auto pd_type = var->GetDataType(); + if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { + PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", + var_name); + } + var_type_map_[var_name] = pd2ng_type_map[pd_type]; + } + + if (var->Persistable()) { + persistables_.insert(var->Name()); + } + } + + auto ops_desc = block.AllOps(); + int idx = interval[0]; + while (idx < interval[1]) { + auto op_desc = ops_desc.at(idx); + auto op = framework::OpRegistry::CreateOp(*op_desc); + fused_ops_.push_back(std::move(op)); + ++idx; + } + + while (ops_desc.at(idx)->Type() != framework::kFetchOpType) { + auto op_desc = ops_desc.at(idx); + for (auto& var_name_item : op_desc->Inputs()) { + for (auto& var_name : var_name_item.second) { + post_op_inputs_.insert(var_name); + } + } + ++idx; + } + + while (idx < static_cast(ops_desc.size()) && + ops_desc.at(idx)->Type() == framework::kFetchOpType) { + std::string fetch_target_name = ops_desc.at(idx)->Input("X")[0]; + fetches_.insert(fetch_target_name); + ++idx; + } + + if (ops_desc.at(interval.at(0) - 1)->Type() == framework::kFeedOpType && + ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) { + ng_op_state_ = OpState::FULL; + } + + for (auto* op_desc : ops_desc) { + if (op_desc->Type().find("_grad") != std::string::npos) { + ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TRAIN + : OpState::PARTIAL_TRAIN; + break; + } + } + + if (ng_op_state_ != OpState::FULL_TRAIN && + ng_op_state_ != OpState::PARTIAL_TRAIN) { + ng_op_state_ = ng_op_state_ == OpState::FULL ? 
OpState::FULL_TEST + : OpState::PARTIAL_TEST; + } +} + +void NgraphEngine::GetNgInputShape( + std::shared_ptr op) { + framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); + op->RuntimeInferShape(scope_, place_, ctx); + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto sp = Ddim2Shape(tensor_pd->dims()); + if (std::find(var_in_.begin(), var_in_.end(), var_name) != + var_in_.end()) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + // auto ng_type = pd2ng_type_map.at(GetDataTypeOfVar(var)); + auto ng_type = var_type_map_.at(var_name); + auto prm = + std::make_shared(ng_type, sp, true); + (*var_node_map_)[var_name] = prm; + (*var_in_node_map_)[var_name] = prm; + } + } + } + } + } +} + +void NgraphEngine::BuildNgNodes() { + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + for (auto& var_name : var_name_item.second) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + auto ng_shape = Ddim2Shape(ddim); + auto ng_type = var_type_map_.at(var_name); + auto prm = std::make_shared(ng_type, + ng_shape, true); + (*var_node_map_)[var_name] = prm; + } + } + } + } + } + framework::NgraphBridge ngb(var_node_map_); + for (auto& op : fused_ops_) { + ngb.BuildNgNode(op); + } +} + +void NgraphEngine::BuildNgIO() { + std::unordered_set inputs; + std::unordered_set outputs; + + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + inputs.insert(var_name); + const bool is_output = outputs.find(var_name) != outputs.end(); + if (!is_output && + std::find(var_in_.begin(), var_in_.end(), var_name) == + var_in_.end()) { + // fill var_in here to keep lhs and rhs order + var_in_.push_back(var_name); + } + } + } + + if (op->Type() != "fill_constant") { + GetNgInputShape(op); + } + + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + outputs.insert(var_name); + } + } + } + + // var_out.clear(); + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + switch (ng_op_state_) { + case OpState::PARTIAL_TEST: + if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || + fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::FULL_TEST: + if (fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::PARTIAL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + post_op_inputs_.find(var_name) != post_op_inputs_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::FULL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + default: + var_out_.push_back(var_name); + } + } + } + } +} + +void 
NgraphEngine::BuildNgFunction() {
+  BuildNgNodes();
+  ngraph_function_ = nullptr;
+  ngraph::NodeVector func_outputs;
+  ngraph::ParameterVector func_inputs;
+
+  for (auto& vo : var_out_) {
+    func_outputs.push_back(var_node_map_->at(vo));
+  }
+
+  for (auto& vi : var_in_) {
+    std::shared_ptr prm =
+        std::dynamic_pointer_cast(
+            var_in_node_map_->at(vi));
+    func_inputs.push_back(prm);
+  }
+
+  ngraph_function_ =
+      std::make_shared(func_outputs, func_inputs);
+}
+
+void NgraphEngine::GetNgFunction() {
+  bool cache_on = true;
+  if (cache_on) {
+    std::string input_shape_str;
+    for (auto& var_name : var_in_) {
+      auto shape = var_node_map_->at(var_name)->get_shape();
+      for (size_t i = 0; i < shape.size(); ++i) {
+        input_shape_str += std::to_string(shape.at(i));
+      }
+    }
+    func_cache_key_ = input_shape_str + func_cache_key_;
+    if (func_cache_.find(func_cache_key_) != func_cache_.end()) {
+      ngraph_function_ = func_cache_.at(func_cache_key_);
+    } else {
+      BuildNgFunction();
+      func_cache_[func_cache_key_] = ngraph_function_;
+    }
+  } else {
+    BuildNgFunction();
+  }
+}
+
+void NgraphEngine::Run(const framework::Scope& scope,
+                       const platform::Place& place) const {
+  std::vector> t_in;
+  std::vector> t_out;
+
+  for (size_t i = 0; i < var_in_.size(); ++i) {
+    auto vi = var_in_.at(i);
+    auto sp = var_node_map_->at(vi)->get_shape();
+    std::shared_ptr ti;
+    auto* var = scope.FindVar(vi);
+    if (var && var->IsType()) {
+      auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
+      PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
+                     "Ensure ngraph tensor layout aligns with paddle tensor");
+      auto ng_type = var_type_map_.at(vi);
+      if (ng_type == ngraph::element::f32) {
+        auto pd_arr = tensor_pd->mutable_data(place);
+        ti = backend_->create_tensor(ngraph::element::f32, sp, pd_arr);
+      } else if (ng_type == ngraph::element::i32) {
+        const int* arr = tensor_pd->data();
+        ti = backend_->create_tensor(ngraph::element::i32, sp,
+                                     const_cast(arr));
+      } else if (ng_type == ngraph::element::i64) {
+        auto pd_arr = tensor_pd->mutable_data(place);
+        ti = backend_->create_tensor(ngraph::element::i64, sp, pd_arr);
+      } else if (ng_type == ngraph::element::f64) {
+        auto pd_arr = tensor_pd->mutable_data(place);
+        ti = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
+      } else if (ng_type == ngraph::element::boolean) {
+        auto pd_arr = tensor_pd->mutable_data(place);
+        ti = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
+      } else {
+        PADDLE_THROW("Data type not handled for var %s", vi);
+      }
+    } else {
+      PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
+    }
+    bool is_test = (ng_op_state_ == OpState::PARTIAL_TEST ||
+                    ng_op_state_ == OpState::FULL_TEST)
+                       ? true
+                       : false;
+    bool is_persistable =
+        (persistables_.find(vi) != persistables_.end()) ? true : false;
+    if (is_test && is_persistable) {
+      ti->set_stale(false);
+    }
+    t_in.push_back(ti);
+  }
+
+  for (size_t i = 0; i < var_out_.size(); ++i) {
+    auto vo = var_out_[i];
+    auto* var = scope.FindVar(vo);
+    std::shared_ptr to;
+    if (var && var->IsType()) {
+      auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
+      auto dd = tensor_pd->dims();
+      ngraph::Shape sp = Ddim2Shape(dd);
+      auto ng_type = var_type_map_.at(vo);
+      if (ng_type == ngraph::element::f32) {
+        auto pd_arr = tensor_pd->mutable_data(place);
+        to = backend_->create_tensor(ng_type, sp, pd_arr);
+      } else if (ng_type == ngraph::element::i64) {
+        auto pd_arr = tensor_pd->mutable_data(place);
+        to = backend_->create_tensor(ng_type, sp, pd_arr);
+      } else if (ng_type == ngraph::element::i32) {
+        auto pd_arr = tensor_pd->mutable_data(place);
+        to = backend_->create_tensor(ng_type, sp, pd_arr);
+      } else if (ng_type == ngraph::element::f64) {
+        auto pd_arr = tensor_pd->mutable_data(place);
+        to = backend_->create_tensor(ng_type, sp, pd_arr);
+      } else if (ng_type == ngraph::element::boolean) {
+        auto pd_arr = tensor_pd->mutable_data(place);
+        to = backend_->create_tensor(ng_type, sp, pd_arr);
+      } else {
+        PADDLE_THROW("Data type not handled for var %s", vo);
+      }
+      t_out.push_back(to);
+    } else {
+      PADDLE_THROW("Cannot find var or tensor with var name %s", vo);
+    }
+  }
+
+  backend_->call(backend_->compile(ngraph_function_), t_out, t_in);
+}  // NgraphEngine::Run
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h
new file mode 100644
index 0000000000..bf5ff2a743
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ngraph_engine.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include
+#include
+
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+#include "ngraph/ngraph.hpp"
+
+namespace paddle {
+namespace operators {
+
+enum class OpState {                /* nGraph support state on ops          */
+                     FULL_TRAIN,    /* Support full ops for train           */
+                     PARTIAL_TRAIN, /* Support partial ops for train        */
+                     FULL_TEST,     /* Support full list of ops for test    */
+                     PARTIAL_TEST,  /* Support partial list of ops for test */
+                     FULL,          /* All ops supported from feed to fetch */
+                     UNKNOWN        /* Output all for debug purpose         */
+};
+
+// perform graph build through bridge and execute computation
+class NgraphEngine {
+ public:
+  explicit NgraphEngine(const framework::Scope& scope,
+                        const platform::Place& place,
+                        const std::string& serialized_graph,
+                        const std::vector& interval);
+
+  void Run(const framework::Scope& scope, const platform::Place& place) const;
+
+  static void EnableNgraph(const framework::ProgramDesc& program);
+
+ private:
+  static std::unordered_map>
+      func_cache_;
+  const framework::Scope& scope_;
+  const platform::Place& place_;
+  std::vector> fused_ops_;
+  std::unordered_map var_type_map_;
+  std::unordered_set persistables_;
+  std::unordered_set fetches_;
+  std::unordered_set post_op_inputs_;
+  OpState ng_op_state_ = OpState::UNKNOWN;
+  std::string func_cache_key_;
+
+  // ngraph backend, e.g. CPU
+  static std::shared_ptr backend_;
+  // ngraph function to call and execute
+  std::shared_ptr ngraph_function_;
+  // var_name of inputs
+  std::vector var_in_;
+  // var_name of outputs from fetch in order
+  std::vector var_out_;
+  // map input vars to nodes
+  std::shared_ptr<
+      std::unordered_map>>
+      var_in_node_map_;
+  // map each var name to an ngraph node
+  std::shared_ptr<
+      std::unordered_map>>
+      var_node_map_;
+  // prepare info for ngraph engine
+  void Prepare(const framework::BlockDesc& block,
+               const std::vector& interval);
+  // get ngraph input and define ngraph input parameters
+  void GetNgInputShape(std::shared_ptr op);
+  // Call ngraph bridge to map ops
+  void BuildNgNodes();
+  // get the ngraph input and output var list
+  void BuildNgIO();
+  // build ngraph function call
+  void BuildNgFunction();
+  // Check cache for ngraph function or otherwise build the function
+  void GetNgFunction();
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc
new file mode 100644
index 0000000000..3051ca123b
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc
@@ -0,0 +1,52 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
*/ + +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine_op.h" + +namespace paddle { +namespace operators { + +class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Xs", "A list of inputs.").AsDispensable(); + AddOutput("Ys", "A list of outputs").AsDispensable(); + AddAttr("graph", "the graph."); + AddAttr>("interval", "op interval supported by ngraph"); + AddComment("ngraph engine operator."); + } +}; + +class NgraphEngineInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(ngraph_engine, ops::NgraphEngineOp, ops::NgraphEngineOpMaker, + ops::NgraphEngineOpMaker); +REGISTER_OP_CPU_KERNEL( + ngraph_engine, + ops::NgraphEngineKernel); diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.h b/paddle/fluid/operators/ngraph/ngraph_engine_op.h new file mode 100644 index 0000000000..d2974298b0 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +class NgraphEngineOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::OpKernelType kt = framework::OpKernelType( + framework::proto::VarType::FP32, ctx.GetPlace()); + return kt; + } +}; + +template +class NgraphEngineKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& scope = ctx.scope(); + auto place = ctx.GetPlace(); + std::string serialized_graph = ctx.Attr("graph"); + auto interval = ctx.Attr>("interval"); + + NgraphEngine ngraph_engine(scope, place, serialized_graph, interval); + ngraph_engine.Run(scope, place); + } +}; + +} // namespace operators +} // namespace paddle From 42e61af861a27d5186e2518ff444b08ab5b572db Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 25 Jan 2019 10:07:17 +0800 Subject: [PATCH 096/123] polish test=develop --- paddle/fluid/imperative/layer.cc | 2 +- paddle/fluid/imperative/layer.h | 5 +++++ .../paddle/fluid/tests/unittests/test_imperative.py | 11 ++++------- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 23a1f0f348..83fc6ee2e2 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -211,8 +211,8 @@ std::map> OpBase::ApplyGrad() { std::vector grad_outputs; if (backward_id_ > 0) { - grad_outputs.resize(1); VLOG(3) << "py_layer_grad"; + grad_outputs.resize(1); grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad( backward_id_, diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 1f4c31b197..dc97433a51 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -199,8 +199,10 @@ class OpBase { // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_. framework::OpDesc* op_desc_; int forward_id_; + // When has backward, one of `grad_op_descs_` or `backward_id_` is set, // not both. + // Note: each fwd op corresponds to a vector of bwd ops. std::vector grad_op_descs_; int backward_id_; @@ -211,8 +213,11 @@ class OpBase { OpBasePtrMap pre_ops_; std::map> pre_ops_out_idx_; + // Inputs to a vector of bwd ops. std::vector grad_input_vars_; + // Outputs to a vector of bwd ops. 
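
The three comments added in this hunk describe parallel bookkeeping: one forward op may expand into several backward ops, and the gradient input and output variables are stored per backward op, in vectors that stay index-aligned with grad_op_descs_. A minimal, hypothetical sketch of that layout — standalone types for illustration only, not Paddle's real VarBase/OpDesc machinery:

```cpp
// Hypothetical standalone sketch of the layout the comments above describe:
// each forward op keeps a vector of backward-op slots, and every slot carries
// its own input/output variable lists, so grad_op_descs_, grad_input_vars_
// and grad_output_vars_ can be walked in lockstep by index.
#include <map>
#include <string>
#include <vector>

struct GradOpSlot {
  std::string bwd_op_type;                               // e.g. "mul_grad"
  std::map<std::string, std::vector<std::string>> ins;   // inputs of this bwd op
  std::map<std::string, std::vector<std::string>> outs;  // outputs of this bwd op
};

struct FwdOpRecord {
  std::string fwd_op_type;
  std::vector<GradOpSlot> grads;  // one entry per generated backward op
};
```
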
std::vector grad_output_vars_; + framework::BlockDesc* block_; }; diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 40f9b325fe..adf35c851b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -68,19 +68,16 @@ class MLP(fluid.imperative.Layer): class TestImperative(unittest.TestCase): def test_sum_op(self): + x = np.ones([2, 2], np.float32) with fluid.imperative.guard(): inputs = [] for _ in range(10): - inputs.append( - fluid.imperative.base.to_variable( - np.ones([2, 2], np.float32))) - sys.stderr.write('%s\n' % inputs[0].dtype) + inputs.append(fluid.imperative.base.to_variable(x)) ret = fluid.layers.sums(inputs) - sys.stderr.write('%s\n' % ret.dtype) loss = fluid.layers.reduce_sum(ret) - sys.stderr.write('%s\n' % loss.dtype) loss._backward() - sys.stderr.write('%s %s\n' % (ret._numpy(), inputs[0]._gradient())) + self.assertTrue(np.allclose(ret._numpy(), x * 10)) + self.assertTrue(np.allclose(inputs[0]._gradient(), x)) def test_layer(self): with fluid.imperative.guard(): From 981fc2bdba09fca125e154426eb04e69aa3c3d2d Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 25 Jan 2019 11:03:27 +0800 Subject: [PATCH 097/123] fix bug in merge_ids (#15503) * fix mistakes in merge_ids, test=develop --- .../operators/distributed_ops/merge_ids_op.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.h b/paddle/fluid/operators/distributed_ops/merge_ids_op.h index 99c5759019..05c00251b9 100644 --- a/paddle/fluid/operators/distributed_ops/merge_ids_op.h +++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.h @@ -43,9 +43,9 @@ class MergeIdsOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(ids.size(), outs.size(), "the number of Ids and Out should be the same"); - size_t row_ids_size = 0; - int row_size = 0; - int embedding_size = 0; + int64_t row_ids_size = 0; + int64_t row_size = 0; + int64_t embedding_size = 0; for (size_t i = 0; i < x_tensors.size(); ++i) { const auto *x_tensor = x_tensors[i]; @@ -69,7 +69,7 @@ class MergeIdsOpKernel : public framework::OpKernel { for (size_t i = 0; i < x_tensors.size(); ++i) { const auto *row_id = row_ids[i]; - for (int j = 0; j < row_id->numel(); ++j) { + for (auto j = 0; j < row_id->numel(); ++j) { int64_t key = row_id->data()[j]; std::tuple val = std::make_tuple(i, j); selected_rows_idx_map.insert(std::make_pair(key, val)); @@ -84,13 +84,13 @@ class MergeIdsOpKernel : public framework::OpKernel { out->set_lod(out_ids->lod()); - int nums = static_cast(out_ids->dims()[0]); + auto nums = out_ids->dims()[0]; auto *out_data = out->mutable_data( framework::make_ddim({nums, embedding_size}), place); - for (int j = 0; j < nums; ++j) { - int id = out_ids->data()[j]; - auto row_tuple = selected_rows_idx_map[id]; - int64_t row_idx = std::get<1>(row_tuple); + for (auto j = 0; j < nums; ++j) { + auto id = out_ids->data()[j]; + auto row_tuple = selected_rows_idx_map.at(id); + auto row_idx = std::get<1>(row_tuple); const auto *x_tensor = x_tensors[std::get<0>(row_tuple)]; memcpy(out_data + embedding_size * j, From fe8f28c957a1cf44bbb92cf7752de3cfbc6c7cc8 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 25 Jan 2019 11:14:29 +0800 Subject: [PATCH 098/123] Add GetVariableNoBarrier on brpc. 
(#15488) --- .../operators/distributed/CMakeLists.txt | 8 ++- .../operators/distributed/brpc/brpc_client.cc | 46 +++++++++---- .../operators/distributed/brpc/brpc_client.h | 9 +++ .../operators/distributed/brpc/brpc_server.cc | 65 +++++++++++++++++-- paddle/scripts/paddle_build.sh | 3 +- .../fluid/transpiler/details/checkport.py | 4 +- 6 files changed, 111 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index cb492f9995..fc28fe818d 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -20,7 +20,7 @@ if(WITH_GRPC) collective_client.cc collective_server.cc ${GRPC_SRCS} PROTO send_recv.proto - DEPS lod_tensor selected_rows_functor memory) + DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS}) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) @@ -32,15 +32,17 @@ else() set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib) + brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc request_handler_impl.cc rpc_client.cc rpc_server.cc variable_response.cc collective_client.cc collective_server.cc ${BRPC_SRCS} PROTO send_recv.proto - DEPS lod_tensor selected_rows memory) + DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS}) - set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib) + set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS}) cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL) endif() diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc index 87bdb83503..b8e63f42e2 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc @@ -62,7 +62,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "SendRPC"; + const std::string method = kSendRPC; VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); framework::AsyncIO([=] { @@ -156,15 +156,18 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, const std::string& method_name, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; + const std::string out_varname_val = out_var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "GetRPC"; - VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + const std::string method = kGetRPC; + VarHandlePtr var_h( + new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); framework::AsyncIO([=] { auto ch_ctx = 
ch_ptr->Pop(); @@ -175,6 +178,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); req.set_trainer_id(trainer_id_); google::protobuf::Closure* done = brpc::NewCallback( @@ -182,8 +186,10 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, platform::RecordRPCEvent record_event(method, p_ctx); - if (method_name == "GetMonomerVariable") { + if (method_name == kGetMonomerRPC) { ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); + } else if (method_name == kGetNoBarrierRPC) { + ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done); } else { ch_ctx->stub->GetVariable(cntl, &req, response, done); } @@ -198,25 +204,39 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, return var_h; } +VarHandlePtr BRPCClient::AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out) { + std::string var_name_no_barrier = + string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); + + return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name, + kGetNoBarrierRPC, time_out); +} + VarHandlePtr BRPCClient::AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out); + return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC, + time_out); } VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, const std::string& var_name, int64_t time_out) { - return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out); + return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out); } VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out); + return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, + time_out); } VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, @@ -234,7 +254,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "PrefetchRPC"; + const std::string method = kPrefetchRPC; VarHandlePtr var_h( new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); @@ -270,7 +290,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { - return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE, + return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE, time_out); } @@ -286,7 +306,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(FETCH_BARRIER_MESSAGE); - const std::string method = "FetchBarrierRPC"; + const std::string method = kFetchBarrierRPC; // var handle VarHandlePtr var_h( new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); @@ -367,7 +387,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { - return 
AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out); + return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out); } void BRPCClient::SendComplete() { @@ -394,9 +414,9 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage( google::protobuf::Closure* done = brpc::NewCallback( &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - if (method_name == "CheckPointNotifyRPC") { + if (method_name == kCheckPointNotifyRPC) { ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); - } else if (method_name == "GetMonomerBarrier") { + } else if (method_name == kSendMonomerFetchBarrierRPC) { ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); } else { ch_ctx->stub->SendVariable(cntl, &req, response, done); diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h index 2066ade8a5..501a593b11 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.h @@ -65,6 +65,7 @@ class BRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncGetMonomerBarrier( @@ -76,6 +77,13 @@ class BRPCClient : public RPCClient { const framework::Scope& scope, const std::string& var_name, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline); + VarHandlePtr AsyncPrefetchVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, @@ -103,6 +111,7 @@ class BRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, const std::string& method_name, int64_t time_out = FLAGS_rpc_deadline); diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc index cbe0bd09c7..fea9b09414 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_server.cc @@ -45,6 +45,13 @@ class BRPCServiceImpl : public SendRecvService { rpc_server_->GetThreadNum(distributed::kRequestGet))); } + it = rpc_call_map.find(distributed::kRequestGetNoBarrier); + if (it != rpc_call_map.end()) { + request_getnobarrier_h_ = it->second; + getnobarrier_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier))); + } + it = rpc_call_map.find(distributed::kRequestPrefetch); if (it != rpc_call_map.end()) { request_prefetch_h_ = it->second; @@ -112,6 +119,14 @@ class BRPCServiceImpl : public SendRecvService { [=] { _GetVariable(cntl_butil, request, response, done); }); } + void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + getnobarrier_threads_->Run( + [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); }); + } + void _GetVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, google::protobuf::Closure* done) { @@ -122,23 +137,59 @@ class BRPCServiceImpl : public SendRecvService { 
brpc::Controller* cntl = static_cast(cntl_butil); std::string varname = request->varname(); + std::string out_varname = request->out_varname(); VLOG(3) << "RequestGet varname:" << varname + << ", out_varname:" << out_varname << ", trainer_id:" << request->trainer_id() << ", from:" << cntl->remote_side(); auto scope = request_get_h_->scope(); - auto invar = scope->FindVar(varname); + paddle::framework::Variable* invar = nullptr; + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = nullptr; + + request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); + + if (outvar) { + distributed::SerializeToIOBuf(out_varname, outvar, + *request_get_h_->dev_ctx(), response, + &cntl->response_attachment(), "", false); + } + } + + void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) { + PADDLE_ENFORCE(request_getnobarrier_h_ != nullptr, + "RequestGetNoBarrier handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + std::string out_varname = request->out_varname(); int trainer_id = request->trainer_id(); + + VLOG(3) << "RequestGetNoBarrier varname:" << varname + << ", out_varname:" << out_varname << ", trainer_id:" << trainer_id + << ", from:" << cntl->remote_side(); + + auto scope = request_getnobarrier_h_->scope(); + paddle::framework::Variable* invar = nullptr; paddle::framework::Variable* outvar = nullptr; - request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id); + request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); if (outvar) { - distributed::SerializeToIOBuf(varname, outvar, *request_get_h_->dev_ctx(), - response, &cntl->response_attachment(), "", - false); + distributed::SerializeToIOBuf( + out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response, + &cntl->response_attachment(), "", false); } } + void PrefetchVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, @@ -282,6 +333,7 @@ class BRPCServiceImpl : public SendRecvService { private: distributed::RequestHandler* request_send_h_{nullptr}; distributed::RequestHandler* request_get_h_{nullptr}; + distributed::RequestHandler* request_getnobarrier_h_{nullptr}; distributed::RequestHandler* request_prefetch_h_{nullptr}; distributed::RequestHandler* request_checkpoint_h_{nullptr}; distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; @@ -289,9 +341,10 @@ class BRPCServiceImpl : public SendRecvService { distributed::RPCServer* rpc_server_{nullptr}; - // FIXME(gongwb): brpc should support process one rpce use one threadpool. + // FIXME(gongwb): brpc should support process one rpc use one threadpool. 
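
The FIXME above refers to the member pools declared just below: the service keeps one worker pool per RPC kind, so that, for example, slow Get traffic cannot starve Send handling. A rough, self-contained sketch of that dispatch shape follows; the inline ThreadPool is a stand-in assumed to expose a Run(std::function<void()>) interface, which this patch does not show:

```cpp
// Hypothetical mini-dispatcher with one pool per RPC kind. ThreadPool here is
// a synchronous stand-in for illustration only.
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>

struct ThreadPool {
  void Run(std::function<void()> fn) { fn(); }  // synchronous stand-in
};

class RpcDispatcher {
 public:
  void Register(const std::string& rpc_kind) {
    pools_[rpc_kind] = std::make_unique<ThreadPool>();
  }
  void Dispatch(const std::string& rpc_kind, std::function<void()> handler) {
    // Hand the handler to the dedicated pool; the IO thread never blocks.
    pools_.at(rpc_kind)->Run(std::move(handler));
  }

 private:
  std::unordered_map<std::string, std::unique_ptr<ThreadPool>> pools_;
};
```
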
std::unique_ptr send_threads_; std::unique_ptr get_threads_; + std::unique_ptr getnobarrier_threads_; std::unique_ptr prefetch_threads_; std::unique_ptr checkpoint_notify_threads_; }; diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index bb7258ee59..c2156a436e 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -328,7 +328,8 @@ function run_brpc_test() { ======================================== EOF set +x - declare -a other_tests=("test_listen_and_serv_op" "system_allocator_test") + declare -a other_tests=("test_listen_and_serv_op" "system_allocator_test" \ + "rpc_server_test" "varhandle_test" "collective_server_test" "brpc_serde_test") all_tests=`ctest -N` for t in "${other_tests[@]}" diff --git a/python/paddle/fluid/transpiler/details/checkport.py b/python/paddle/fluid/transpiler/details/checkport.py index 6b78ceeaee..89dd4dd50b 100644 --- a/python/paddle/fluid/transpiler/details/checkport.py +++ b/python/paddle/fluid/transpiler/details/checkport.py @@ -16,6 +16,7 @@ import sys import time import socket from contextlib import closing +from six import string_types def wait_server_ready(endpoints): @@ -32,6 +33,7 @@ def wait_server_ready(endpoints): wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) """ + assert not isinstance(endpoints, string_types) while True: all_ok = True not_ready_endpoints = [] @@ -45,7 +47,7 @@ def wait_server_ready(endpoints): all_ok = False not_ready_endpoints.append(ep) if not all_ok: - sys.stderr.write("pserver not ready, wait 3 sec to retry...\n") + sys.stderr.write("server not ready, wait 3 sec to retry...\n") sys.stderr.write("not ready endpoints:" + str(not_ready_endpoints) + "\n") sys.stderr.flush() From c750be6d9dc601f0c22a255a86166749da64e026 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 25 Jan 2019 11:58:14 +0800 Subject: [PATCH 099/123] add some log --- .../fluid/operators/distributed/rpc_server.cc | 25 ++++++++++--------- .../distributed_ops/listen_and_serv_op.cc | 8 +++++- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index cc5b9c29a1..baf6b73b0d 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -39,10 +39,11 @@ void RPCServer::SavePort() const { port_file.open(file_path); port_file << selected_port_; port_file.close(); - VLOG(4) << "selected port written to " << file_path; + VLOG(3) << "selected port written to " << file_path; } void RPCServer::WaitBarrier(const std::string& rpc_name) { + VLOG(3) << "WaitBarrier: " << rpc_name; std::unique_lock lock(this->mutex_); barrier_cond_.wait(lock, [this, &rpc_name] { return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || @@ -54,7 +55,7 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) { } void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; int b = 0; std::unique_lock lock(mutex_); b = ++barrier_counter_[rpc_name]; @@ -71,7 +72,7 @@ void RPCServer::Complete() { client_num_--; need_reset_all_vars_ = true; - VLOG(4) << "decrease client_num to: " << client_num_; + VLOG(3) << "decrease client_num to: " << client_num_; if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { barrier_counter_[kRequestGet]--; } @@ -105,7 +106,7 @@ void RPCServer::RegisterRPC(const std::string& rpc_name, 
static int cond = -1; rpc_cond_map_[rpc_name] = ++cond; - VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler + VLOG(3) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler << ", cond:" << rpc_cond_map_[rpc_name]; } @@ -120,7 +121,7 @@ void RPCServer::SetCond(const std::string& rpc_name) { } void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(4) << "RPCServer WaitCond " << rpc_name; + VLOG(3) << "RPCServer WaitCond " << rpc_name; int cond = 0; { std::unique_lock lock(mutex_); @@ -151,7 +152,7 @@ void RPCServer::RegisterVar(const std::string& var_name, } rpc_cond_.notify_all(); - VLOG(4) << "RegisterVar context:" << h.String(); + VLOG(3) << "RegisterVar context:" << h.String(); } void RPCServer::IncreaseVarBarrier(const std::string& var_name) { @@ -167,11 +168,11 @@ void RPCServer::IncreaseVarBarrier(const std::string& var_name) { barrier_cond_.notify_all(); } - VLOG(4) << "IncreaseVarBarrier context:" << h.String(); + VLOG(3) << "IncreaseVarBarrier context:" << h.String(); } void RPCServer::WaitVarBarrier(const std::string& var_name) { - VLOG(4) << "WaitBarrier var_name:" << var_name; + VLOG(3) << "WaitVarBarrier var_name:" << var_name; std::unique_lock lock(mutex_); barrier_cond_.wait(lock, [&]() { @@ -179,11 +180,11 @@ void RPCServer::WaitVarBarrier(const std::string& var_name) { exit_flag_.load()); }); - VLOG(4) << "WaitBarrier context: " << var_map_[var_name].String(); + VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String(); } void RPCServer::SetVarCond(const std::string& var_name) { - VLOG(4) << "SetVarCond var_name:" << var_name; + VLOG(3) << "SetVarCond var_name:" << var_name; { std::unique_lock lock(mutex_); if (var_map_.find(var_name) != var_map_.end()) { @@ -193,14 +194,14 @@ void RPCServer::SetVarCond(const std::string& var_name) { } void RPCServer::WaitVarCond(const std::string& var_name) { - VLOG(4) << "WaitVarCond var_name:" << var_name; + VLOG(3) << "WaitVarCond var_name:" << var_name; std::unique_lock lock(mutex_); rpc_cond_.wait(lock, [=] { return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); }); - VLOG(4) << "WaitVarCond var_name:" << var_name << " end"; + VLOG(3) << "WaitVarCond var_name:" << var_name << " end"; } MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index 53968831ea..5b30ed472d 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -137,7 +137,9 @@ void ListenAndServOp::RunSyncLoop( while (true) { // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. 
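
The sync loop below drives two barriers per mini-batch: one after every trainer has sent its gradients, and one after every trainer has fetched the updated parameters. A condensed, single-class sketch of the counter-plus-condition-variable barrier that RPCServer::IncreaseBatchBarrier and RPCServer::WaitBarrier implement above, with exit-flag handling and logging omitted:

```cpp
#include <condition_variable>
#include <map>
#include <mutex>
#include <string>

// Condensed sketch: trainers call Arrive() once per phase; the server thread
// blocks in Wait() until all `clients_` trainers have arrived, then Reset()
// clears the counters for the next mini-batch.
class BatchBarrier {
 public:
  explicit BatchBarrier(int clients) : clients_(clients) {}

  void Arrive(const std::string& phase) {  // one call per trainer RPC
    std::lock_guard<std::mutex> lock(mu_);
    if (++count_[phase] >= clients_) cv_.notify_all();
  }

  void Wait(const std::string& phase) {  // called by the sync loop
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [&] { return count_[phase] >= clients_; });
  }

  void Reset() {
    std::lock_guard<std::mutex> lock(mu_);
    count_.clear();
  }

 private:
  int clients_;
  std::map<std::string, int> count_;
  std::mutex mu_;
  std::condition_variable cv_;
};
```
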
+ VLOG(3) << "wait all clients to send gradient"; rpc_service_->SetCond(distributed::kRequestSend); + VLOG(3) << "wait all clients to send send_barrier"; rpc_service_->WaitBarrier(distributed::kRequestSend); if (rpc_service_->IsExit()) { @@ -168,12 +170,16 @@ void ListenAndServOp::RunSyncLoop( } ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); - VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; + VLOG(3) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; + VLOG(3) << "ResetReceivedVars"; ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars()); + VLOG(3) << "wait all clients to get parameters back"; rpc_service_->SetCond(distributed::kRequestGet); + VLOG(3) << "wait all clients to send fetch_barrier"; rpc_service_->WaitBarrier(distributed::kRequestGet); + VLOG(3) << "ResetBarrierCounter"; rpc_service_->ResetBarrierCounter(); } // while(true) } From c52f57de5b12a1733fc786ea4682e6f0f8089a67 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 25 Jan 2019 03:58:25 +0000 Subject: [PATCH 100/123] test=develop, refine_error_message for data type --- paddle/fluid/framework/operator.cc | 15 +++++++++------ paddle/fluid/framework/tensor_impl.h | 3 ++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ee9f6a4805..ab3cf308fc 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1073,7 +1073,9 @@ Scope* OperatorWithKernel::PrepareData( proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { - int data_type = -1; + proto::VarType::Type dafault_data_type = + static_cast(-1); + proto::VarType::Type data_type = dafault_data_type; for (auto& input : this->inputs_) { const std::vector vars = ctx.MultiInputVar(input.first); for (size_t i = 0; i < vars.size(); ++i) { @@ -1090,18 +1092,19 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( if (t != nullptr) { PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized", input.first, i); - int tmp = static_cast(t->type()); + proto::VarType::Type tmp = t->type(); PADDLE_ENFORCE( - tmp == data_type || data_type == -1, + tmp == data_type || data_type == dafault_data_type, "DataType of Paddle Op %s must be the same. 
Get (%d) != (%d)", - Type(), data_type, tmp); + Type(), DataTypeToString(data_type), DataTypeToString(tmp)); data_type = tmp; } } } } - PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input"); - return static_cast(data_type); + PADDLE_ENFORCE(data_type != dafault_data_type, + "DataType should be indicated by input"); + return data_type; } OpKernelType OperatorWithKernel::GetExpectedKernelType( diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index ce3ad18b1f..ef5404e475 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -25,7 +25,8 @@ inline const T* Tensor::data() const { check_memory_size(); bool valid = std::is_same::value || type_ == DataTypeTrait::DataType; - PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_); + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", + DataTypeToString(type_)); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); From 36abc964dff01156119be4c87282a7142ee1998c Mon Sep 17 00:00:00 2001 From: nhzlx Date: Fri, 25 Jan 2019 04:02:01 +0000 Subject: [PATCH 101/123] fix pybind problem: add an enum to AnalysisConfig test=develop --- paddle/fluid/pybind/inference_api.cc | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 2624702666..e05667d2c7 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -180,8 +180,14 @@ void BindNativePredictor(py::module *m) { } void BindAnalysisConfig(py::module *m) { - py::class_(*m, "AnalysisConfig") - .def(py::init()) + py::class_ analysis_config(*m, "AnalysisConfig"); + + py::enum_(analysis_config, "Precision") + .value("Float32", AnalysisConfig::Precision::kFloat32) + .value("Int8", AnalysisConfig::Precision::kInt8) + .export_values(); + + analysis_config.def(py::init()) .def(py::init()) .def(py::init()) .def("set_model", (void (AnalysisConfig::*)(const std::string &)) & @@ -215,7 +221,8 @@ void BindAnalysisConfig(py::module *m) { .def("specify_input_name", &AnalysisConfig::specify_input_name) .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine, py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, - py::arg("min_subgraph_size") = 3) + py::arg("min_subgraph_size") = 3, + py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, py::arg("x") = true) From 84220765a73e68c1a817fb1fd3c7806814a83e7e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 25 Jan 2019 14:48:58 +0800 Subject: [PATCH 102/123] refine code, add more log --- .../distributed/request_handler_impl.cc | 37 ++++++++++++------- .../fluid/operators/distributed/rpc_server.cc | 9 +++-- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 913ae76b38..2ed4683318 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -54,6 +54,11 @@ bool RequestSendHandler::Handle(const std::string& varname, // Async if (!sync_mode_) { VLOG(3) << "async process var: " << varname; + if (varname == BATCH_BARRIER_MESSAGE || varname == COMPLETE_MESSAGE) { + PADDLE_THROW( + "async mode should not recv 
BATCH_BARRIER_MESSAGE or " + "COMPLETE_MESSAGE"); + } try { executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), scope); @@ -95,21 +100,25 @@ bool RequestGetHandler::Handle(const std::string& varname, } } else { if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) { - if (enable_dc_asgd_) { - // NOTE: the format is determined by distributed_transpiler.py - std::string param_bak_name = - string::Sprintf("%s.trainer_%d_bak", varname, trainer_id); - VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id; - auto var = scope_->FindVar(varname); - auto t_orig = var->Get(); - auto param_bak = scope_->Var(param_bak_name); - auto t = param_bak->GetMutable(); - t->mutable_data(dev_ctx_->GetPlace(), t_orig.type()); - VLOG(3) << "copying " << varname << " to " << param_bak_name; - framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); - } - *outvar = scope_->FindVar(varname); + PADDLE_THROW( + "async mode should not send FETCH_BARRIER_MESSAGE or " + "COMPLETE_MESSAGE"); + } + + if (enable_dc_asgd_) { + // NOTE: the format is determined by distributed_transpiler.py + std::string param_bak_name = + string::Sprintf("%s.trainer_%d_bak", varname, trainer_id); + VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id; + auto var = scope_->FindVar(varname); + auto t_orig = var->Get(); + auto param_bak = scope_->Var(param_bak_name); + auto t = param_bak->GetMutable(); + t->mutable_data(dev_ctx_->GetPlace(), t_orig.type()); + VLOG(3) << "copying " << varname << " to " << param_bak_name; + framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); } + *outvar = scope_->FindVar(varname); } return true; } diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index baf6b73b0d..90733fd090 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -43,15 +43,15 @@ void RPCServer::SavePort() const { } void RPCServer::WaitBarrier(const std::string& rpc_name) { - VLOG(3) << "WaitBarrier: " << rpc_name; + VLOG(3) << "WaitBarrier in: " << rpc_name; std::unique_lock lock(this->mutex_); barrier_cond_.wait(lock, [this, &rpc_name] { return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || exit_flag_.load()); }); - VLOG(3) << "batch_barrier_: " << rpc_name << " " - << barrier_counter_[rpc_name]; + VLOG(3) << "WaitBarrier out: " << rpc_name + << " counter: " << barrier_counter_[rpc_name]; } void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { @@ -59,8 +59,11 @@ void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { int b = 0; std::unique_lock lock(mutex_); b = ++barrier_counter_[rpc_name]; + VLOG(3) << rpc_name << " barrier_counter: " << b; if (b >= client_num_) { lock.unlock(); + VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for " + << rpc_name; barrier_cond_.notify_all(); lock.lock(); } From d54494ba87f5945aabcf63e70ac6b97489ce88e7 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 25 Jan 2019 15:03:14 +0800 Subject: [PATCH 103/123] cleanup test=develop (#15347) --- paddle/fluid/framework/details/execution_strategy.h | 5 ++++- paddle/fluid/operators/distributed/variable_response.cc | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 37b07e5736..2edb50e0d8 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ 
b/paddle/fluid/framework/details/execution_strategy.h @@ -25,7 +25,10 @@ struct ExecutionStrategy { size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; - size_t num_iteration_per_drop_scope_{1}; + // If we set this to 1, we will delete all variables when finish a batch. and + // this will loss 15%+ performance. + // Please be aware about this parameters. + size_t num_iteration_per_drop_scope_{100}; ExecutorType type_{kDefault}; bool dry_run_{false}; }; diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index 47ff568a11..7825b4fc82 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -117,8 +117,9 @@ bool VariableResponse::CopyLodTensorData( tensor->mutable_data(ctx.GetPlace(), ToVarType(meta_.data_type())); VLOG(6) << "Tensor.memory_size = " << tensor->memory_size() - << ", Buffer Size = " << length; - PADDLE_ENFORCE_EQ(tensor->memory_size(), static_cast(length)); + << ", Buffer Size = " << length << ", dims:" << dims + << ", numel:" << tensor->numel(); + PADDLE_ENFORCE_GE(tensor->memory_size(), static_cast(length)); return ReadRaw(input, ctx, tensor->place(), tensor_data, length); } From 5a0c6593d543942f0acbb027a4c35f3576877410 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 25 Jan 2019 16:47:19 +0800 Subject: [PATCH 104/123] revert RequestGetHandler --- .../distributed/request_handler_impl.cc | 32 ++++++++----------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 2ed4683318..c609777f0c 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -100,25 +100,21 @@ bool RequestGetHandler::Handle(const std::string& varname, } } else { if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) { - PADDLE_THROW( - "async mode should not send FETCH_BARRIER_MESSAGE or " - "COMPLETE_MESSAGE"); - } - - if (enable_dc_asgd_) { - // NOTE: the format is determined by distributed_transpiler.py - std::string param_bak_name = - string::Sprintf("%s.trainer_%d_bak", varname, trainer_id); - VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id; - auto var = scope_->FindVar(varname); - auto t_orig = var->Get(); - auto param_bak = scope_->Var(param_bak_name); - auto t = param_bak->GetMutable(); - t->mutable_data(dev_ctx_->GetPlace(), t_orig.type()); - VLOG(3) << "copying " << varname << " to " << param_bak_name; - framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); + if (enable_dc_asgd_) { + // NOTE: the format is determined by distributed_transpiler.py + std::string param_bak_name = + string::Sprintf("%s.trainer_%d_bak", varname, trainer_id); + VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id; + auto var = scope_->FindVar(varname); + auto t_orig = var->Get(); + auto param_bak = scope_->Var(param_bak_name); + auto t = param_bak->GetMutable(); + t->mutable_data(dev_ctx_->GetPlace(), t_orig.type()); + VLOG(3) << "copying " << varname << " to " << param_bak_name; + framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); + } + *outvar = scope_->FindVar(varname); } - *outvar = scope_->FindVar(varname); } return true; } From c42ef5bf0531dd28df1773de5e2b439643d5c590 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Fri, 25 Jan 2019 18:30:03 +0800 Subject: [PATCH 
105/123] remove legacy WITH_DOC option test=develop --- CMakeLists.txt | 6 -- Dockerfile | 2 - cmake/FindSphinx.cmake | 147 --------------------------------- paddle/scripts/paddle_build.sh | 31 ------- 4 files changed, 186 deletions(-) delete mode 100644 cmake/FindSphinx.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ec632e206..e85fce5836 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -276,9 +276,3 @@ add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) endif() - -if(WITH_DOC) - find_package(Sphinx REQUIRED) - find_python_module(recommonmark REQUIRED) - add_subdirectory(doc) -endif() diff --git a/Dockerfile b/Dockerfile index acfd091265..fe0721e9b9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,12 +11,10 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub # ENV variables ARG WITH_GPU ARG WITH_AVX -ARG WITH_DOC ENV WOBOQ OFF ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} -ENV WITH_DOC=${WITH_DOC:-OFF} ENV HOME /root # Add bash enhancements diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake deleted file mode 100644 index f74cd4ff8c..0000000000 --- a/cmake/FindSphinx.cmake +++ /dev/null @@ -1,147 +0,0 @@ -# - This module looks for Sphinx -# Find the Sphinx documentation generator -# -# This modules defines -# SPHINX_EXECUTABLE -# SPHINX_FOUND - -find_program(SPHINX_EXECUTABLE - NAMES sphinx-build - PATHS - /usr/bin - /usr/local/bin - /opt/local/bin - DOC "Sphinx documentation generator" -) - -if( NOT SPHINX_EXECUTABLE ) - set(_Python_VERSIONS - 2.7 2.6 2.5 2.4 2.3 2.2 2.1 2.0 1.6 1.5 - ) - - foreach( _version ${_Python_VERSIONS} ) - set( _sphinx_NAMES sphinx-build-${_version} ) - - find_program( SPHINX_EXECUTABLE - NAMES ${_sphinx_NAMES} - PATHS - /usr/bin - /usr/local/bin - /opt/loca/bin - DOC "Sphinx documentation generator" - ) - endforeach() -endif() - -include(FindPackageHandleStandardArgs) - -find_package_handle_standard_args(Sphinx DEFAULT_MSG - SPHINX_EXECUTABLE -) - - -option( SPHINX_HTML_OUTPUT "Build a single HTML with the whole content." ON ) -option( SPHINX_DIRHTML_OUTPUT "Build HTML pages, but with a single directory per document." OFF ) -option( SPHINX_HTMLHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in htmlhelp." OFF ) -option( SPHINX_QTHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in qthelp." OFF ) -option( SPHINX_DEVHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in devhelp." OFF ) -option( SPHINX_EPUB_OUTPUT "Build HTML pages with additional information for building a documentation collection in epub." OFF ) -option( SPHINX_LATEX_OUTPUT "Build LaTeX sources that can be compiled to a PDF document using pdflatex." OFF ) -option( SPHINX_MAN_OUTPUT "Build manual pages in groff format for UNIX systems." OFF ) -option( SPHINX_TEXT_OUTPUT "Build plain text files." 
OFF ) - - -mark_as_advanced( - SPHINX_EXECUTABLE - SPHINX_HTML_OUTPUT - SPHINX_DIRHTML_OUTPUT - SPHINX_HTMLHELP_OUTPUT - SPHINX_QTHELP_OUTPUT - SPHINX_DEVHELP_OUTPUT - SPHINX_EPUB_OUTPUT - SPHINX_LATEX_OUTPUT - SPHINX_MAN_OUTPUT - SPHINX_TEXT_OUTPUT -) - -function( Sphinx_add_target target_name builder conf cache source destination ) - add_custom_target( ${target_name} ALL - COMMAND ${SPHINX_EXECUTABLE} -b ${builder} - -d ${cache} - -c ${conf} - ${source} - ${destination} - COMMENT "Generating sphinx documentation: ${builder}" - COMMAND cd ${destination} && ln -sf ./index_*.html index.html - ) - - set_property( - DIRECTORY APPEND PROPERTY - ADDITIONAL_MAKE_CLEAN_FILES - ${destination} - ) -endfunction() - -# Target dependencies can be optionally listed at the end. -function( Sphinx_add_targets target_base_name conf source base_destination ) - - set( _dependencies ) - - foreach( arg IN LISTS ARGN ) - set( _dependencies ${_dependencies} ${arg} ) - endforeach() - - if( ${SPHINX_HTML_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_html html ${conf} ${source} ${base_destination}/html ) - - add_dependencies( ${target_base_name}_html ${_dependencies} ) - endif() - - if( ${SPHINX_DIRHTML_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_dirhtml dirhtml ${conf} ${source} ${base_destination}/dirhtml ) - - add_dependencies( ${target_base_name}_dirhtml ${_dependencies} ) - endif() - - if( ${SPHINX_QTHELP_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_qthelp qthelp ${conf} ${source} ${base_destination}/qthelp ) - - add_dependencies( ${target_base_name}_qthelp ${_dependencies} ) - endif() - - if( ${SPHINX_DEVHELP_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_devhelp devhelp ${conf} ${source} ${base_destination}/devhelp ) - - add_dependencies( ${target_base_name}_devhelp ${_dependencies} ) - endif() - - if( ${SPHINX_EPUB_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_epub epub ${conf} ${source} ${base_destination}/epub ) - - add_dependencies( ${target_base_name}_epub ${_dependencies} ) - endif() - - if( ${SPHINX_LATEX_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_latex latex ${conf} ${source} ${base_destination}/latex ) - - add_dependencies( ${target_base_name}_latex ${_dependencies} ) - endif() - - if( ${SPHINX_MAN_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_man man ${conf} ${source} ${base_destination}/man ) - - add_dependencies( ${target_base_name}_man ${_dependencies} ) - endif() - - if( ${SPHINX_TEXT_OUTPUT} ) - Sphinx_add_target( ${target_base_name}_text text ${conf} ${source} ${base_destination}/text ) - - add_dependencies( ${target_base_name}_text ${_dependencies} ) - endif() - - if( ${BUILD_TESTING} ) - sphinx_add_target( ${target_base_name}_linkcheck linkcheck ${conf} ${source} ${base_destination}/linkcheck ) - - add_dependencies( ${target_base_name}_linkcheck ${_dependencies} ) - endif() -endfunction() diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index c2156a436e..1135caf4f8 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -173,7 +173,6 @@ function cmake_gen() { -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} ${PYTHON_FLAGS} -DWITH_DSO=ON - -DWITH_DOC=${WITH_DOC:-OFF} -DWITH_GPU=${WITH_GPU:-OFF} -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} -DWITH_DISTRIBUTE=${distibuted_flag} @@ -208,7 +207,6 @@ EOF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \ ${PYTHON_FLAGS} \ -DWITH_DSO=ON \ - -DWITH_DOC=${WITH_DOC:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \ 
-DWITH_DISTRIBUTE=${distibuted_flag} \ @@ -528,31 +526,6 @@ function bind_test() { wait } - -function gen_docs() { - mkdir -p ${PADDLE_ROOT}/build - cd ${PADDLE_ROOT}/build - cat < Date: Fri, 25 Jan 2019 10:57:25 +0000 Subject: [PATCH 106/123] fix comments test=develop --- paddle/fluid/inference/analysis/argument.h | 1 - paddle/fluid/inference/analysis/helper.h | 37 ++++++++++++++- .../inference/analysis/ir_pass_manager.cc | 15 ++++--- .../inference/analysis/ir_pass_manager.h | 1 + .../ir_passes/tensorrt_subgraph_pass.cc | 45 +++++++++---------- .../fluid/inference/api/analysis_predictor.cc | 17 ++++--- .../operators/tensorrt/tensorrt_engine_op.h | 3 +- 7 files changed, 82 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 1b703c35e2..f9018f8ddb 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -105,7 +105,6 @@ struct Argument { DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string); DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool); - DECL_ARGUMENT_FIELD(model_path, ModelPath, std::string); // The overall graph to work on. DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 728e0a9c0d..120f6ef27d 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include #include @@ -29,9 +30,14 @@ limitations under the License. */ #include "paddle/fluid/platform/port.h" #ifdef _WIN32 +#include +#include #define GCC_ATTRIBUTE(attr__) ; +#define MKDIR(path) _mkdir(path) #else +#include #define GCC_ATTRIBUTE(attr__) __attribute__((attr__)); +#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) #endif #define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result) @@ -163,7 +169,7 @@ static bool PathExists(const std::string &path) { return false; } -static std::string GetDirRoot(const std::string path) { +static std::string GetDirRoot(const std::string &path) { char sep = '/'; #ifdef _WIN32 @@ -177,11 +183,40 @@ static std::string GetDirRoot(const std::string path) { return path; } +static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) { + std::string opt_cache_dir = model_root + "/_opt_cache/"; + if (!PathExists(opt_cache_dir)) { + PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1, + "Can not create optimize cache directory: %s, Make sure you " + "have permission to write", + opt_cache_dir); + } + return opt_cache_dir; +} + static std::string GetTrtCalibPath(const std::string &model_root, const std::string &engine_key) { return model_root + "/trt_calib_" + engine_key; } +// If there is no calib table data file in model_opt_cache_dir, return "". 
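
The helper declared next implements exactly the contract stated in the comment above. For reference, a standalone sketch of the same lookup, with the path layout (model root, then the _opt_cache directory, then a trt_calib_<engine_key> file) taken from the helpers in this hunk and error handling elided:

```cpp
#include <fstream>
#include <sstream>
#include <string>

// Standalone sketch of the cache lookup: returns the serialized calibration
// table if one was saved for this engine key, or "" to signal that
// calibration still has to run. Path layout follows the helpers above.
std::string LoadCalibTable(const std::string &model_root,
                           const std::string &engine_key) {
  const std::string path = model_root + "/_opt_cache/trt_calib_" + engine_key;
  std::ifstream infile(path, std::ios::in);
  if (!infile) return "";  // no table on disk yet: first (calibration) run
  std::stringstream buffer;
  buffer << infile.rdbuf();  // slurp the whole table as an opaque string
  return buffer.str();
}
```
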
+static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir, + const std::string &engine_key, + bool enable_int8) { + std::string trt_calib_table_path = + GetTrtCalibPath(model_opt_cache_dir, engine_key); + if (enable_int8 && FileExists(trt_calib_table_path)) { + VLOG(3) << "Calibration table file: " << trt_calib_table_path + << "is found here"; + std::ifstream infile(trt_calib_table_path, std::ios::in); + std::stringstream buffer; + buffer << infile.rdbuf(); + std::string calibration_data(buffer.str()); + return calibration_data; + } + return ""; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 9aaae16144..403ebfe72a 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -72,14 +72,17 @@ void IRPassManager::CreatePasses(Argument *argument, new framework::ProgramDesc *( const_cast(&argument->main_program()))); - bool enable_int8 = false; - if (argument->tensorrt_precision_mode() == - contrib::AnalysisConfig::Precision::kInt8) { - enable_int8 = true; - } + bool enable_int8 = argument->tensorrt_precision_mode() == + contrib::AnalysisConfig::Precision::kInt8; pass->Set("enable_int8", new bool(enable_int8)); - pass->Set("model_dir", new std::string(argument->model_path())); + std::string model_opt_cache_dir = + argument->Has("model_dir") + ? argument->model_dir() + : GetDirRoot(argument->model_program_path()); + pass->Set( + "model_opt_cache_dir", + new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); } // graph_ = pass->Apply(std::move(graph_)); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index f378d35d9a..2a595cb36b 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.h +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -29,6 +29,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/helper.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 67c7f7da92..69a9caec03 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -68,6 +68,19 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( return graph; } +std::string GenerateEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} + void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, Graph *graph) const { auto *op_desc = node->Op(); @@ -97,7 +110,10 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, *op->Proto() = *node->Op()->Proto(); } - // collect inputs + // Then, we will use the input_names_with_id and output_names_with_id to + // generate the eigine key. + // So, We use set instead of unordered_set here to ensure that the engine key + // is unique. 
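
The std::set choice called out in the comment above is what makes the engine key reproducible rather than merely unique: iterating a std::set visits names in sorted order, so the same subgraph always yields the same concatenated string, and with it the same hash under a given standard-library implementation. That stability is what lets a previously written trt_calib_<key> file be found again on the next run. A minimal sketch of the derivation, mirroring GenerateEngineKey earlier in this file:

```cpp
#include <functional>
#include <set>
#include <string>

// Deterministic key: std::set iterates in sorted order, so the same sets of
// input/output names always produce the same blob and the same hash string.
std::string MakeEngineKey(const std::set<std::string> &inputs,
                          const std::set<std::string> &outputs) {
  std::string blob;
  for (const auto &name : inputs) blob += name;
  for (const auto &name : outputs) blob += name;
  return std::to_string(std::hash<std::string>()(blob));
}
```
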
std::set input_names; std::set input_names_with_id; for (auto *x : node->inputs) { @@ -217,30 +233,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); auto enable_int8 = Get("enable_int8"); - SetAttr(op_desc->Proto(), "calibration_data", std::string("")); + auto engine_key = + GenerateEngineKey(input_names_with_id, output_names_with_id); - // we use the subgraph's inputs and outputs to generate the engine key. - std::string engine_hash_key = ""; - for (auto name : input_names_with_id) { - engine_hash_key += name; - } - for (auto name : output_names_with_id) { - engine_hash_key += name; - } - - auto engine_key = std::to_string(std::hash()(engine_hash_key)); + std::string calibration_data = GetTrtCalibTableData( + Get("model_opt_cache_dir"), engine_key, enable_int8); + SetAttr(op_desc->Proto(), "calibration_data", calibration_data); - auto trt_calib_file = - GetTrtCalibPath(Get("model_dir"), engine_key); - VLOG(3) << "engine key: " << engine_key; - if (enable_int8 && FileExists(trt_calib_file)) { - VLOG(3) << "Calibration table file: " << trt_calib_file << "is found here"; - std::ifstream infile(trt_calib_file, std::ios::in); - std::stringstream buffer; - buffer << infile.rdbuf(); - std::string calibration_data(buffer.str()); - SetAttr(op_desc->Proto(), "calibration_data", calibration_data); - } SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "engine_key", engine_key); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 250de65b32..9dff9363aa 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -40,6 +40,7 @@ #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" + #endif DECLARE_bool(profile); @@ -341,7 +342,6 @@ void AnalysisPredictor::OptimizeInferenceProgram() { // Analyze inference_program if (!config_.model_dir().empty()) { argument_.SetModelDir(config_.model_dir()); - argument_.SetModelPath(config_.model_dir()); } else { PADDLE_ENFORCE( !config_.params_file().empty(), @@ -349,7 +349,6 @@ void AnalysisPredictor::OptimizeInferenceProgram() { PADDLE_ENFORCE(!config_.prog_file().empty()); std::string dir = inference::analysis::GetDirRoot(config_.prog_file()); - argument_.SetModelPath(dir); argument_.SetModelProgramPath(config_.prog_file()); argument_.SetModelParamsPath(config_.params_file()); } @@ -599,7 +598,8 @@ bool AnalysisPredictor::SaveTrtCalibToDisk() { Singleton::Global().Get(engine_name); LOG(INFO) << "Wait for calib threads done."; calib_engine->calib_->waitAndSetDone(); - LOG(INFO) << "Finish wait."; + LOG(INFO) << "Generating TRT Calibration table data, this may cost a lot " + "of time..."; calib_engine->thr_->join(); std::string calibration_table_data = calib_engine->calib_->getCalibrationTableAsString(); @@ -609,9 +609,16 @@ bool AnalysisPredictor::SaveTrtCalibToDisk() { return false; } + std::string model_opt_cache_dir = + argument_.Has("model_dir") + ? 
argument_.model_dir() + : inference::analysis::GetDirRoot(argument_.model_program_path()); + std::string calibration_table_data_path = - inference::analysis::GetTrtCalibPath(argument_.model_path(), - engine_name); + inference::analysis::GetTrtCalibPath( + inference::analysis::GetOrCreateModelOptCacheDir( + model_opt_cache_dir), + engine_name); std::ofstream ofile(calibration_table_data_path, std::ios::out); LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file " diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 59bb558722..e83247d39e 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -133,7 +133,8 @@ class TensorRTEngineOp : public framework::OperatorBase { // This process will builds a 32-bit trt engine, runs it on the calibration // set, and records a histogram for each // tensor of the distribution of activation values. - LOG(INFO) << "Running calibration trt int8 ..."; + LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_ + << " is running calibration trt int8... "; int runtime_batch = 1; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); From 2a82c5651ea8789c7997643c3840ed934324c4a1 Mon Sep 17 00:00:00 2001 From: Haihao Shen Date: Fri, 25 Jan 2019 19:48:06 +0800 Subject: [PATCH 107/123] Refine INT8 calibration API; shorten the iteration number to reduce test time; test=develop --- .../fluid/contrib/int8_inference/utility.py | 34 +++++++++++++++--- .../fluid/contrib/tests/test_calibration.py | 35 +++++++------------ 2 files changed, 42 insertions(+), 27 deletions(-) diff --git a/python/paddle/fluid/contrib/int8_inference/utility.py b/python/paddle/fluid/contrib/int8_inference/utility.py index 197fc5f2d2..40de038f28 100644 --- a/python/paddle/fluid/contrib/int8_inference/utility.py +++ b/python/paddle/fluid/contrib/int8_inference/utility.py @@ -32,10 +32,13 @@ class Calibrator(object): def __init__(self, *args, **kwargs): self.program = kwargs['program'] - self.iterations = kwargs['iterations'] self.pretrained_model = kwargs['pretrained_model'] - self.debug = kwargs['debug'] + self.debug = kwargs['debug'] if 'debug' in kwargs else False self.algo = kwargs['algo'] + self.output = kwargs['output'] + self.feed_var_names = kwargs['feed_var_names'] + self.fetch_list = kwargs['fetch_list'] + self.exe = kwargs['exe'] self._conv_input_var_name = [] self._conv_output_var_name = [] @@ -54,17 +57,38 @@ class Calibrator(object): self._u8_output_var = [] self._s8_output_var = [] self._persistable_vars = [] + self._sampling_data = {} - def generate_sampling_program(self): self.__init_analysis() self.__generate_output_program() - def generate_quantized_data(self, sampling_data): - self.__sampling(sampling_data) + def save_int8_model(self): + self.__sampling(self._sampling_data) self.__save_scale() self.__update_program() self.__update_output_program_attr() self.__display_debug() + self.__save_offline_model() + + def sample_data(self): + ''' + Sampling the tensor data of variable. + ''' + for i in self.sampling_program.list_vars(): + if i.name in self.sampling_vars: + np_data = np.array(fluid.global_scope().find_var(i.name) + .get_tensor()) + if i.name not in self._sampling_data: + self._sampling_data[i.name] = [] + self._sampling_data[i.name].append(np_data) + + def __save_offline_model(self): + ''' + Save the quantized model to the disk. 
+ ''' + fluid.io.save_inference_model(self.output, self.feed_var_names, + self.fetch_list, self.exe, + self.sampling_program) def __display_debug(self): if self.debug: diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py index 17e4eb8b83..ed5ea70260 100644 --- a/python/paddle/fluid/contrib/tests/test_calibration.py +++ b/python/paddle/fluid/contrib/tests/test_calibration.py @@ -26,7 +26,7 @@ import paddle.fluid.profiler as profiler from PIL import Image, ImageEnhance import math sys.path.append('..') -import int8_inference.utility as ut +import int8_inference.utility as int8_utility random.seed(0) np.random.seed(0) @@ -120,13 +120,13 @@ class TestCalibration(unittest.TestCase): def setUp(self): # TODO(guomingz): Put the download process in the cmake. # Download and unzip test data set - imagenet_dl_url = 'http://paddle-inference-dist.bj.bcebos.com/int8/calibration_test_data.tar.gz' + imagenet_dl_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz' zip_file_name = imagenet_dl_url.split('/')[-1] cmd = 'rm -rf data {} && mkdir data && wget {} && tar xvf {} -C data'.format( zip_file_name, imagenet_dl_url, zip_file_name) os.system(cmd) # resnet50 fp32 data - resnet50_fp32_model_url = 'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz' + resnet50_fp32_model_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz' resnet50_zip_name = resnet50_fp32_model_url.split('/')[-1] resnet50_unzip_folder_name = 'resnet50_fp32' cmd = 'rm -rf {} {} && mkdir {} && wget {} && tar xvf {} -C {}'.format( @@ -135,8 +135,7 @@ class TestCalibration(unittest.TestCase): resnet50_zip_name, resnet50_unzip_folder_name) os.system(cmd) - self.iterations = 100 - self.skip_batch_num = 5 + self.iterations = 50 def run_program(self, model_path, generate_int8=False, algo='direct'): image_shape = [3, 224, 224] @@ -163,16 +162,15 @@ class TestCalibration(unittest.TestCase): print("Start calibration ...") - calibrator = ut.Calibrator( + calibrator = int8_utility.Calibrator( program=infer_program, pretrained_model=model_path, - iterations=100, - debug=False, - algo=algo) - - sampling_data = {} + algo=algo, + exe=exe, + output=int8_model, + feed_var_names=feed_dict, + fetch_list=fetch_targets) - calibrator.generate_sampling_program() test_info = [] cnt = 0 for batch_id, data in enumerate(val_reader()): @@ -192,13 +190,7 @@ class TestCalibration(unittest.TestCase): feed_dict[1]: label}, fetch_list=fetch_targets) if generate_int8: - for i in calibrator.sampling_program.list_vars(): - if i.name in calibrator.sampling_vars: - np_data = np.array(fluid.global_scope().find_var(i.name) - .get_tensor()) - if i.name not in sampling_data: - sampling_data[i.name] = [] - sampling_data[i.name].append(np_data) + calibrator.sample_data() test_info.append(np.mean(acc1) * len(data)) cnt += len(data) @@ -209,9 +201,8 @@ class TestCalibration(unittest.TestCase): break if generate_int8: - calibrator.generate_quantized_data(sampling_data) - fluid.io.save_inference_model(int8_model, feed_dict, fetch_targets, - exe, calibrator.sampling_program) + calibrator.save_int8_model() + print( "Calibration is done and the corresponding files were generated at {}". 
format(os.path.abspath("calibration_out"))) From fa286b105265f1e99ef9c5fc26eab169139e2bd5 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Wed, 23 Jan 2019 03:27:24 -0800 Subject: [PATCH 108/123] LRN reengineering Added reading dst mem pd from lrn pd coding style fixes test=develop --- paddle/fluid/operators/lrn_mkldnn_op.cc | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 4e4f977fcc..d4325b2c02 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -78,10 +78,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto dims = paddle::framework::vectorize2int(x->dims()); auto src_md = paddle::platform::MKLDNNMemDesc( - dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - - auto dst_md = paddle::platform::MKLDNNMemDesc( - dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + dims, mkldnn::memory::data_type::f32, x->format()); auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward, mkldnn::lrn_across_channels, @@ -92,8 +89,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { k}; auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; - auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine}, - static_cast(output_data)}; if (!is_test) { const std::string key = ctx.op().Output("Out"); @@ -110,11 +105,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory->set_data_handle( static_cast(const_cast(input_data))); + auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(), + static_cast(output_data)); auto workspace_memory = insert_to_context( key_workspace_memory, dev_ctx, forward_pd->workspace_primitive_desc()); run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } else { auto forward_pd = mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; @@ -122,8 +122,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory_pd, static_cast(const_cast(input_data))}; auto workspace_memory = mkldnn::memory{forward_pd.workspace_primitive_desc()}; + auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(), + static_cast(output_data)); run_primitive(forward_pd, src_memory, workspace_memory, dst_memory); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } } }; From 9c3910f3904b30f76cc6dce4ba9b7b61cff8cba7 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 25 Jan 2019 23:44:57 +0800 Subject: [PATCH 109/123] IncreaseBatchBarrier should be in the right condition test=develop --- paddle/fluid/operators/distributed/rpc_server.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index 90733fd090..397976fe99 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -56,6 +56,8 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) { void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + // barrier msg should make sure that it's in the right cond(send|recv) + WaitCond(rpc_name); int b = 0; std::unique_lock lock(mutex_); b = 
++barrier_counter_[rpc_name]; @@ -124,7 +126,7 @@ void RPCServer::SetCond(const std::string& rpc_name) { } void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer WaitCond " << rpc_name; + VLOG(3) << "RPCServer WaitCond in " << rpc_name; int cond = 0; { std::unique_lock lock(mutex_); @@ -134,6 +136,7 @@ void RPCServer::WaitCond(const std::string& rpc_name) { std::unique_lock lock(mutex_); rpc_cond_.wait( lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); + VLOG(3) << "RPCServer WaitCond out " << rpc_name; } void RPCServer::RegisterVar(const std::string& var_name, From 4d13434443bf38369ae61590774a5a9efcc8b673 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 25 Jan 2019 23:46:57 +0800 Subject: [PATCH 110/123] fix a little problem test=develop --- paddle/fluid/operators/distributed/request_handler_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index c609777f0c..a1c5c07774 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -54,7 +54,7 @@ bool RequestSendHandler::Handle(const std::string& varname, // Async if (!sync_mode_) { VLOG(3) << "async process var: " << varname; - if (varname == BATCH_BARRIER_MESSAGE || varname == COMPLETE_MESSAGE) { + if (varname == BATCH_BARRIER_MESSAGE) { PADDLE_THROW( "async mode should not recv BATCH_BARRIER_MESSAGE or " "COMPLETE_MESSAGE"); From da3f9cc5126fb1c3da74ee7073d1c7f843b6a736 Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Fri, 25 Jan 2019 09:39:21 -0800 Subject: [PATCH 111/123] rm ngraph_operator.cc test=develop --- paddle/fluid/framework/ngraph_operator.cc | 545 ---------------------- 1 file changed, 545 deletions(-) delete mode 100644 paddle/fluid/framework/ngraph_operator.cc diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc deleted file mode 100644 index 7e174c7def..0000000000 --- a/paddle/fluid/framework/ngraph_operator.cc +++ /dev/null @@ -1,545 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include -#include - -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/ngraph_bridge.h" -#include "paddle/fluid/framework/ngraph_operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/var_desc.h" -#include "paddle/fluid/framework/var_type.h" - -#include "ngraph/ngraph.hpp" - -namespace paddle { -namespace framework { - -static ngraph::Shape Ddim2Shape(const DDim& dims) { - ngraph::Shape sp; - for (int i = 0; i < dims.size(); ++i) { - int k = dims[i]; - k = k == 0 ? 
1 : k; - sp.push_back(k); - } - return sp; -} - -static std::map pd2ng_type_map = { - {proto::VarType::FP32, ngraph::element::f32}, - {proto::VarType::FP64, ngraph::element::f64}, - {proto::VarType::INT32, ngraph::element::i32}, - {proto::VarType::INT64, ngraph::element::i64}, - {proto::VarType::BOOL, ngraph::element::boolean}, -}; - -typedef enum { /* nGraph support state on ops */ - FULL_TRAIN, /* Support full ops for train */ - PARTIAL_TRAIN, /* Support partial ops for train */ - FULL_TEST, /* Support full list of ops for test */ - PARTIAL_TEST /* Support partial list of ops for test */ -} op_state; - -// perform graph build through bridge and execute computation -class NgraphEngine { - public: - explicit NgraphEngine(const Scope& scope, const platform::Place& place, - const std::vector>& ops, - const std::unordered_map< - std::string, ngraph::element::Type>& var_type_map, - const std::unordered_set& persist, - const std::unordered_set& fetches, - const std::unordered_set& post_op_inputs, - op_state ng_op_state) - : scope_(scope), - place_(place), - fused_ops_(ops), - var_type_map_(var_type_map), - persistables_(persist), - fetches_(fetches), - post_op_inputs_(post_op_inputs), - ng_op_state_(ng_op_state) { - var_in_node_map_ = std::make_shared< - std::unordered_map>>(); - - var_node_map_ = std::make_shared< - std::unordered_map>>(); - - BuildNgIO(); - - GetNgFunction(); - } - - void Run(const Scope& scope, const platform::Place& place) const; - - private: - static std::unordered_map> - func_cache_; - const Scope& scope_; - const platform::Place& place_; - std::vector> fused_ops_; - std::unordered_map var_type_map_; - std::unordered_set persistables_; - std::unordered_set fetches_; - std::unordered_set post_op_inputs_; - op_state ng_op_state_; - - // ngraph backend eg. 
CPU - static std::shared_ptr backend_; - // ngraph function to call and execute - std::shared_ptr ngraph_function_; - // var_name of inputs - std::vector var_in_; - // var_name of outputs from fetch in order - std::vector var_out_; - // map input vars to nodes - std::shared_ptr< - std::unordered_map>> - var_in_node_map_; - // map each var name with a ngraph node - std::shared_ptr< - std::unordered_map>> - var_node_map_; - // cache key to check if function is cached - std::shared_ptr GetCacheKey(); - // get ngraph input and define ngraph input parameters - void GetNgInputShape(std::shared_ptr op); - // Call ngraph bridge to map ops - void BuildNgNodes(); - // get the ngraph input and output var list - void BuildNgIO(); - // build ngraph function call - void BuildNgFunction(); - // Check cache for ngraph function or otherwise build the function - void GetNgFunction(); -}; - -std::vector>::iterator>> -NgraphOperator::NgraphOpIntervals( - std::vector>* ops) { - std::vector>::iterator>> - intervals; - if (ops->empty()) { - return intervals; - } - size_t size = ops->size(); - size_t left = 0; - while (left < size && ops->at(left)->Type() != kFeedOpType) { - ++left; - } - if (left == size) { - return intervals; - } - while (left < size && ops->at(left)->Type() == kFeedOpType) { - ++left; - } - - size_t right = left; - while (right < size && ops->at(right)->Type() != kFetchOpType) { - ++right; - } - if (right == size) { - return intervals; - } - if (left >= right) return intervals; - - // (left, right - 1) represents indices between feed and fetch - size_t pivot = left; - while (pivot < right) { - auto op_type = ops->at(pivot)->Type(); - if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) == - paddle::framework::NgraphBridge::NG_NODE_MAP.end()) { - ++pivot; - } else { - size_t start = pivot, end = start; - while (pivot < right && - (paddle::framework::NgraphBridge::NG_NODE_MAP.find( - ops->at(pivot)->Type()) != - paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { - ++pivot; - ++end; - } - std::vector>::iterator> - interval = {ops->begin() + start, ops->begin() + end}; - intervals.push_back(interval); - } - } // end while - - return intervals; -} - -NgraphOperator::NgraphOperator( - const ProgramDesc& prog, size_t block_id, - std::vector>::iterator start, - std::vector>::iterator end, - const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, const AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs), - pdesc_(prog), - block_(block_id) { - for (std::vector>::iterator it = start; - it != end; ++it) { - fused_ops_.push_back(std::move(*it)); - } - - for (std::vector>::iterator it = end; - (*it)->Type() != kFetchOpType; ++it) { - for (auto& var_name_item : (*it)->Inputs()) { - for (auto& var_name : var_name_item.second) { - post_op_inputs_.insert(var_name); - } - } - } - - if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) { - is_full_ = true; - } - - Process(); -} - -void NgraphOperator::Process() { - auto& bdesc = pdesc_.Block(block_); - for (auto& var : bdesc.AllVars()) { - if (!(var->GetType() == proto::VarType::SELECTED_ROWS || - var->GetType() == proto::VarType::LOD_TENSOR || - var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) { - continue; - } - - auto var_name = var->Name(); - if (var->Name() == framework::kEmptyVarName) { - continue; - } - - if (var_name != "fetch" && var_name != "feed") { - auto pd_type = var->GetDataType(); - if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { - 
PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", - var_name); - } - var_type_map_[var_name] = pd2ng_type_map[pd_type]; - } - - if (var->Persistable()) { - persistables_.insert(var->Name()); - } - } - - for (auto* op : bdesc.AllOps()) { - if (op->Type() == kFetchOpType) { - std::string fetch_target_name = op->Input("X")[0]; - fetches_.insert(fetch_target_name); - } - } -} - -void NgraphOperator::RunImpl(const Scope& scope, - const platform::Place& place) const { - op_state ng_op_state = PARTIAL_TEST; - auto& bdesc = pdesc_.Block(block_); - for (auto* op : bdesc.AllOps()) { - if (op->Type().find("_grad") != std::string::npos) { - ng_op_state = PARTIAL_TRAIN; - break; - } - } - - if (is_full_) { - ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; - } - - NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_, - persistables_, fetches_, post_op_inputs_, - ng_op_state); - ngraph_engine.Run(scope, place); -} - -std::unordered_map> - NgraphEngine::func_cache_ = {}; - -std::shared_ptr NgraphEngine::backend_ = - ngraph::runtime::Backend::create("CPU"); - -void NgraphEngine::GetNgInputShape(std::shared_ptr op) { - RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); - op->RuntimeInferShape(scope_, place_, ctx); - for (auto& var_name_item : op->Inputs()) { - for (auto& var_name : var_name_item.second) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto sp = Ddim2Shape(tensor_pd->dims()); - if (std::find(var_in_.begin(), var_in_.end(), var_name) != - var_in_.end()) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - auto ng_type = var_type_map_.at(var_name); - auto prm = - std::make_shared(ng_type, sp, true); - (*var_node_map_)[var_name] = prm; - (*var_in_node_map_)[var_name] = prm; - } - } - } - } - } -} - -void NgraphEngine::BuildNgNodes() { - for (auto& var_name : var_out_) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto& ddim = tensor_pd->dims(); - auto ng_shape = Ddim2Shape(ddim); - auto ng_type = var_type_map_.at(var_name); - auto prm = - std::make_shared(ng_type, ng_shape, true); - (*var_node_map_)[var_name] = prm; - } - } - } - - paddle::framework::NgraphBridge ngb(var_node_map_); - for (auto& op : fused_ops_) { - ngb.BuildNgNode(op); - } -} - -void NgraphEngine::BuildNgIO() { - std::unordered_set inputs; - std::unordered_set outputs; - - for (auto& op : fused_ops_) { - for (auto& var_name_item : op->Inputs()) { - for (auto& var_name : var_name_item.second) { - inputs.insert(var_name); - const bool is_output = outputs.find(var_name) != outputs.end(); - if (!is_output && - std::find(var_in_.begin(), var_in_.end(), var_name) == - var_in_.end()) { - // fill var_in here to keep lhs and rhs order - var_in_.push_back(var_name); - } - } - } - - if (op->Type() != "fill_constant") { - GetNgInputShape(op); - } - - for (auto& var_name_item : op->Outputs()) { - PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, - "op %s has more than 1 output - Not handling yet", - op->Type()); - for (auto& var_name : var_name_item.second) { - outputs.insert(var_name); - } - } - } - - // var_out.clear(); - for (auto& op : fused_ops_) { - for (auto& var_name_item : op->Outputs()) { - PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, - "op %s has more than 1 output - Not handling yet", - op->Type()); - for (auto& 
var_name : var_name_item.second) { - switch (ng_op_state_) { - case PARTIAL_TEST: - if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || - fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); - } - break; - case FULL_TEST: - if (fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); - } - break; - case PARTIAL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || - post_op_inputs_.find(var_name) != post_op_inputs_.end() || - persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); - } - break; - case FULL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || - persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); - } - break; - default: - var_out_.push_back(var_name); - } - } - } - } -} - -void NgraphEngine::BuildNgFunction() { - BuildNgNodes(); - ngraph_function_ = nullptr; - ngraph::NodeVector func_outputs; - ngraph::ParameterVector func_inputs; - - for (auto& vo : var_out_) { - func_outputs.push_back(var_node_map_->at(vo)); - } - - for (auto& vi : var_in_) { - std::shared_ptr prm = - std::dynamic_pointer_cast( - var_in_node_map_->at(vi)); - func_inputs.push_back(prm); - } - - ngraph_function_ = - std::make_shared(func_outputs, func_inputs); -} - -std::shared_ptr NgraphEngine::GetCacheKey() { - auto cache_key = std::make_shared(""); - *cache_key += std::to_string(fused_ops_.size()); - for (auto& op : fused_ops_) { - *cache_key += op->Type(); - } - for (auto& var_name : var_in_) { - auto shape = var_node_map_->at(var_name)->get_shape(); - *cache_key += var_name; - *cache_key += var_type_map_.at(var_name).c_type_string(); - for (size_t i = 0; i < shape.size(); ++i) { - *cache_key += std::to_string(shape.at(i)); - } - } - - for (auto& var_name : var_out_) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto& ddim = tensor_pd->dims(); - for (int i = 0; i < ddim.size(); ++i) { - *cache_key += std::to_string(ddim[i]); - } - } - } - return cache_key; -} - -void NgraphEngine::GetNgFunction() { - bool cache_on = true; - if (cache_on) { - std::string cache_key_val = *GetCacheKey(); - if (func_cache_.find(cache_key_val) != func_cache_.end()) { - ngraph_function_ = func_cache_.at(cache_key_val); - } else { - BuildNgFunction(); - func_cache_[cache_key_val] = ngraph_function_; - } - } else { - BuildNgFunction(); - } -} - -void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { - std::vector> t_in; - std::vector> t_out; - - for (size_t i = 0; i < var_in_.size(); ++i) { - auto vi = var_in_.at(i); - auto sp = var_node_map_->at(vi)->get_shape(); - std::shared_ptr ti; - auto* var = scope.FindVar(vi); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), - "Ensure ngraph tensor layout align with paddle tensor"); - if (tensor_pd->type() == proto::VarType::FP32) { - const float* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::f32, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::INT32) { - const int* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::i32, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::INT64) { - const int64_t* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::i64, sp, - const_cast(arr)); - } else if (tensor_pd->type() == 
proto::VarType::FP64) { - const double* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::f64, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::BOOL) { - const bool* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::boolean, sp, - const_cast(arr)); - } else { - PADDLE_THROW("Data type not handling for var %s", vi); - } - } else { - PADDLE_THROW("Cannot find var or tensor with var name %s", vi); - } - bool is_test = (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST) - ? true - : false; - bool is_persistable = - (persistables_.find(vi) != persistables_.end()) ? true : false; - if (is_test && is_persistable) { - ti->set_stale(false); - } - t_in.push_back(ti); - } - - for (size_t i = 0; i < var_out_.size(); ++i) { - auto var_name = var_out_[i]; - auto* var = scope.FindVar(var_name); - std::shared_ptr to; - if (var && var->IsType()) { - auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); - auto dd = tensor_pd->dims(); - ngraph::Shape sp = Ddim2Shape(dd); - auto ng_type = var_type_map_.at(var_name); - if (ng_type == ngraph::element::f32) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); - } else if (ng_type == ngraph::element::i64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); - } else if (ng_type == ngraph::element::f64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); - } else if (ng_type == ngraph::element::boolean) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); - } else { - PADDLE_THROW("Data type not handled in for var %s", var_name); - } - t_out.push_back(to); - } else { - PADDLE_THROW("Cannot find var or tensor with var name %s", var_name); - } - } - - backend_->call(backend_->compile(ngraph_function_), t_out, t_in); -} // NgraphEngine::RunImpl -} // namespace framework -} // namespace paddle From 8e9308a51a8857e2cb74477952a152132a597065 Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Fri, 25 Jan 2019 11:36:35 -0800 Subject: [PATCH 112/123] mv ngraph_bridge to ngraph directory test=develop --- paddle/fluid/framework/CMakeLists.txt | 4 --- paddle/fluid/operators/ngraph/CMakeLists.txt | 1 + .../ngraph}/ngraph_bridge.cc | 36 +++++++++---------- .../ngraph}/ngraph_bridge.h | 12 +++---- .../fluid/operators/ngraph/ngraph_engine.cc | 13 ++++--- 5 files changed, 31 insertions(+), 35 deletions(-) rename paddle/fluid/{framework => operators/ngraph}/ngraph_bridge.cc (55%) rename paddle/fluid/{framework => operators/ngraph}/ngraph_bridge.h (84%) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 8cb0c4e668..2ba2437de6 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -129,10 +129,6 @@ cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) -if(WITH_NGRAPH) - cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) -endif(WITH_NGRAPH) - cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) diff --git a/paddle/fluid/operators/ngraph/CMakeLists.txt 
b/paddle/fluid/operators/ngraph/CMakeLists.txt index 83f78d505d..6b256ef026 100644 --- a/paddle/fluid/operators/ngraph/CMakeLists.txt +++ b/paddle/fluid/operators/ngraph/CMakeLists.txt @@ -1,4 +1,5 @@ if(WITH_NGRAPH) + cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto) op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context) endif() diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc similarity index 55% rename from paddle/fluid/framework/ngraph_bridge.cc rename to paddle/fluid/operators/ngraph/ngraph_bridge.cc index 365870c54e..d6e897ed46 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc @@ -17,39 +17,39 @@ limitations under the License. */ #include #include "ngraph/ngraph.hpp" -#include "paddle/fluid/framework/ngraph_bridge.h" -#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/ngraph/ngraph_bridge.h" #include "paddle/fluid/operators/ngraph/ngraph_ops.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { -namespace framework { +namespace operators { namespace NG_OPS = paddle::operators::ngraphs; std::map&, + std::function&, std::shared_ptr>>)>> NgraphBridge::NG_NODE_MAP = { {"elementwise_add", NG_OPS::BuildElementwiseAddNode}, {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode}, - {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode}, - {"mean", paddle::operators::ngraphs::BuildMeanNode}, - {"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode}, - {"mul", paddle::operators::ngraphs::BuildMulNode}, - {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, - {"softmax", paddle::operators::ngraphs::BuildSoftmaxNode}, - {"softmax_grad", paddle::operators::ngraphs::BuildSoftmaxGradNode}, - {"scale", paddle::operators::ngraphs::BuildScaleNode}, - {"relu", paddle::operators::ngraphs::BuildUnaryNode}, - {"tanh", paddle::operators::ngraphs::BuildUnaryNode}, - {"top_k", paddle::operators::ngraphs::BuildTopKNode}}; - -void NgraphBridge::BuildNgNode(const std::shared_ptr& op) { + {"fill_constant", NG_OPS::BuildFillConstantNode}, + {"mean", NG_OPS::BuildMeanNode}, + {"mean_grad", NG_OPS::BuildMeanGradNode}, + {"mul", NG_OPS::BuildMulNode}, + {"mul_grad", NG_OPS::BuildMulGradNode}, + {"softmax", NG_OPS::BuildSoftmaxNode}, + {"softmax_grad", NG_OPS::BuildSoftmaxGradNode}, + {"scale", NG_OPS::BuildScaleNode}, + {"relu", NG_OPS::BuildUnaryNode}, + {"tanh", NG_OPS::BuildUnaryNode}, + {"top_k", NG_OPS::BuildTopKNode}}; + +void NgraphBridge::BuildNgNode( + const std::shared_ptr& op) { auto& op_type = op->Type(); NG_NODE_MAP[op_type](op, ngb_node_map_); } -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/operators/ngraph/ngraph_bridge.h similarity index 84% rename from paddle/fluid/framework/ngraph_bridge.h rename to paddle/fluid/operators/ngraph/ngraph_bridge.h index 5ad7b8daeb..c57988f8f6 100644 --- a/paddle/fluid/framework/ngraph_bridge.h +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.h @@ -21,16 +21,16 @@ limitations under the License. 
*/ #include "ngraph/node.hpp" -namespace paddle { -namespace framework { +#include "paddle/fluid/framework/operator.h" -class OperatorBase; +namespace paddle { +namespace operators { class NgraphBridge { public: static std::map< std::string, - std::function&, + std::function&, std::shared_ptr>>)>> NG_NODE_MAP; @@ -41,7 +41,7 @@ class NgraphBridge { var_node_map) : ngb_node_map_(var_node_map) {} - void BuildNgNode(const std::shared_ptr& op); + void BuildNgNode(const std::shared_ptr& op); private: std::shared_ptr< @@ -49,5 +49,5 @@ class NgraphBridge { ngb_node_map_; }; -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc index fde3a5ba55..bec4b514a2 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine.cc +++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc @@ -24,11 +24,11 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/ngraph/ngraph_bridge.h" #include "paddle/fluid/operators/ngraph/ngraph_engine.h" namespace paddle { @@ -88,15 +88,14 @@ static std::vector> NgraphOpIntervals( int pivot = left; while (pivot < right) { auto op_type = ops.at(pivot)->Type(); - if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) == - paddle::framework::NgraphBridge::NG_NODE_MAP.end()) { + if (NgraphBridge::NG_NODE_MAP.find(op_type) == + NgraphBridge::NG_NODE_MAP.end()) { ++pivot; } else { int start = pivot, end = start; while (pivot < right && - (paddle::framework::NgraphBridge::NG_NODE_MAP.find( - ops.at(pivot)->Type()) != - paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { + (NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) != + NgraphBridge::NG_NODE_MAP.end())) { ++pivot; ++end; } @@ -283,7 +282,7 @@ void NgraphEngine::BuildNgNodes() { } } } - framework::NgraphBridge ngb(var_node_map_); + NgraphBridge ngb(var_node_map_); for (auto& op : fused_ops_) { ngb.BuildNgNode(op); } From e2818c8608a6116244d7c890e199f5bf652f7712 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Sat, 26 Jan 2019 16:20:18 +0800 Subject: [PATCH 113/123] add dynamic memory optim (#15457) --- paddle/fluid/inference/analysis/argument.h | 4 +- .../analysis/passes/memory_optimize_pass.cc | 188 +++++++++++------- .../analysis/passes/memory_optimize_pass.h | 2 +- paddle/fluid/inference/api/analysis_config.cc | 24 ++- .../fluid/inference/api/analysis_predictor.cc | 24 ++- .../fluid/inference/api/analysis_predictor.h | 2 +- .../inference/api/paddle_analysis_config.h | 18 +- .../tests/api/analyzer_dam_tester.cc | 24 ++- 8 files changed, 181 insertions(+), 105 deletions(-) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 88ce61f9b9..71c4a54dea 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -133,7 +133,9 @@ struct Argument { // Memory optimized related. 
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); - DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool); + DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool); + DECL_ARGUMENT_FIELD(static_memory_optim_force_update, + StaticMemoryOptimForceUpdate, bool); // Indicate which kind of sort algorithm is used for operators, the memory // optimization relays on the sort algorithm. DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int); diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 57683c0b72..3d1be9196f 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -444,6 +444,26 @@ std::vector>> DeseralizeBatchVarShapes( return batch_shapes; } +// Replace the -1 in shape to a real number to fake the shape. +std::vector>> FakeBatchVarShapes( + const framework::ProgramDesc& program) { + std::vector>> res; + res.emplace_back(); + auto& record = res.front(); + const int fake_batch_size = 3; + for (auto* var : program.Block(0).AllVars()) { + if (var->GetType() == + framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { + auto shape = var->GetShape(); + for (auto& v : shape) { + if (v < 0) v = fake_batch_size; + } + record[var->Name()].assign(shape.begin(), shape.end()); + } + } + return res; +} + // Calculate the average dim of each tensor from the batch shape cache. std::unordered_map GetBatchAverageSize( const std::vector>>& batches) { @@ -478,6 +498,7 @@ std::vector> AnalysisBatchShapesByBatchSize( std::unordered_map var_batchsize_hashes; for (auto& batch : batches) { for (auto& ele : batch) { + PADDLE_ENFORCE(!ele.second.empty()); int batch_size = ele.second.front(); // TODO(Superjomn) might consume large memory here, use combine hash. var_batchsize_hashes[ele.first] << batch_size; @@ -538,9 +559,21 @@ std::vector> AnalysisBatchShapesBySimilarSize( std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; } +std::pair GetRange( + const std::unordered_map& ave_size) { + auto res = std::make_pair(std::numeric_limits::max(), + std::numeric_limits::min()); + for (auto& item : ave_size) { + res.first = std::min(item.second, res.first); + res.second = std::max(item.second, res.second); + } + return res; +} + void MemoryOptimizePass::RunImpl(Argument* argument) { // When force update, should not optimize memory. - if (!argument->enable_memory_optim() || argument->memory_optim_force_update()) + if (!argument->enable_memory_optim() || + argument->static_memory_optim_force_update()) return; graph_ = argument->main_graph_ptr(); @@ -549,21 +582,38 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { argument->model_program_path_valid() ? 
argument->model_program_path() : ""); VLOG(3) << "Load memory cache from " << path; - if (inference::IsFileExists(path)) { - VLOG(4) << "Performing memory optimize"; - auto batches = DeseralizeBatchVarShapes(path); - auto var_batch_ave_size = GetBatchAverageSize(batches); + std::vector>> batches; + + if (argument->static_memory_optim() && inference::IsFileExists(path)) { + string::PrettyLogInfo("--- Performing static memory optimize"); + batches = DeseralizeBatchVarShapes(path); + } else { + string::PrettyLogInfo("--- Performing dynamic memory optimize"); + batches = FakeBatchVarShapes(argument->main_program()); + } + auto var_batch_ave_size = GetBatchAverageSize(batches); + + // Get min and max memory size. + const auto range = GetRange(var_batch_ave_size); + const int cluster_size = std::max( + static_cast((range.second - range.first) / 100 /*cluster num*/), + 1024); + const int cluster_size1 = std::max( + static_cast((range.second - range.first) / 1000 /*cluster num*/), + 1024); - std::unordered_map tensor_nodes; - space_table_t space_table; - CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table); + std::unordered_map tensor_nodes; + space_table_t space_table; + CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table); - std::unordered_map reuse_table; - double max_saving_ratio = 0.; + std::unordered_map reuse_table; + double max_saving_ratio = 0.; - std::vector> strategies; + std::vector> strategies; - for (int sort_kind = 0; sort_kind < 2; sort_kind++) { + for (int sort_kind = 0; sort_kind < 2; sort_kind++) { + if (argument->static_memory_optim()) { + // This strategy only make scene in static memory optimize. strategies.emplace_back([&, sort_kind] { auto clustered_vars_by_batch_size = AnalysisBatchShapesByBatchSize(batches); @@ -572,71 +622,67 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { space_table, &reuse_table, sort_kind, &allocation); return allocation; }); + } - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, 1024); // interval 1kb - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = + AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size); + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = + AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size1); + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( + space_table, batches, + std::numeric_limits::max()); // no intervals + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + } - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, 1024 * 1024); // interval 1MB - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, 
var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); + std::function* best_strategy{nullptr}; - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, - std::numeric_limits::max()); // no intervals - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); + // Try all strategies to get the best result. + for (auto& strategy : strategies) { + auto allocation = strategy(); + string::PrettyLogDetail("--- get strategy saving %f memory for workspace", + allocation.GetSavingRatio()); + if (allocation.GetSavingRatio() > max_saving_ratio) { + max_saving_ratio = allocation.GetSavingRatio(); + best_strategy = &strategy; } + } + if (!best_strategy) { + LOG(ERROR) << "This model makes poor memory optimize, skip memory optimize"; + return; + } + auto memory_allocation = (*best_strategy)(); - std::function* best_strategy{nullptr}; + string::PrettyLogInfo( + "--- Saved %.2f%s memory for workspace(temporary variables)", + memory_allocation.GetSavingRatio() * 100, "%"); - // Try all strategies to get the best result. - for (auto& strategy : strategies) { - auto allocation = strategy(); - string::PrettyLogDetail("--- get strategy saving %f memory for workspace", - allocation.GetSavingRatio()); - if (allocation.GetSavingRatio() > max_saving_ratio) { - max_saving_ratio = allocation.GetSavingRatio(); - best_strategy = &strategy; - } - } - if (!best_strategy) { - LOG(ERROR) - << "This model makes poor memory optimize, skip memory optimize"; - return; - } - auto memory_allocation = (*best_strategy)(); - - string::PrettyLogH2( - "--- Saved %.2f%s memory for workspace(temporary variables)", - memory_allocation.GetSavingRatio() * 100, "%"); - string::PrettyLogDetail("--- Allocated %d MB", - memory_allocation.allocated / 1024. / 1024.); - string::PrettyLogDetail("--- Saved %d MB", - memory_allocation.saved / 1024. 
/ 1024.); - argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, - new std::unordered_set); - auto& vars2remove = - argument->main_graph().Get>( - framework::ir::kGraphToProgramVarsToRemove); - - PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); - argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); - } + argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, + new std::unordered_set); + auto& vars2remove = + argument->main_graph().Get>( + framework::ir::kGraphToProgramVarsToRemove); + + PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); + argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); } float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const { diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index fa1ad9c8c6..216f416de0 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index f9da3004ed..e6008ba335 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -95,7 +95,8 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { CP_MEMBER(memory_pool_init_size_mb_); CP_MEMBER(enable_memory_optim_); - CP_MEMBER(memory_optim_force_update_); + CP_MEMBER(static_memory_optim_); + CP_MEMBER(static_memory_optim_force_update_); // TensorRT releated. 
CP_MEMBER(use_tensorrt_); CP_MEMBER(tensorrt_workspace_size_); @@ -238,7 +239,8 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() { ss << tensorrt_min_subgraph_size_; ss << enable_memory_optim_; - ss << memory_optim_force_update_; + ss << static_memory_optim_; + ss << static_memory_optim_force_update_; ss << use_mkldnn_; for (auto &item : mkldnn_enabled_op_types_) ss << item; @@ -278,9 +280,11 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { #endif } -void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) { +void contrib::AnalysisConfig::EnableMemoryOptim( + bool static_optim, bool force_update_static_cache) { enable_memory_optim_ = true; - memory_optim_force_update_ = force_update_cache; + static_memory_optim_ = static_optim; + static_memory_optim_force_update_ = force_update_static_cache; Update(); } @@ -300,4 +304,16 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, Update(); } +NativeConfig contrib::AnalysisConfig::ToNativeConfig() const { + NativeConfig config; + config.model_dir = model_dir_; + config.prog_file = prog_file_; + config.param_file = params_file_; + config.use_gpu = use_gpu_; + config.device = device_id_; + config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); + config.specify_input_name = specify_input_name_; + return config; +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2b0cad5faa..9f8a78f7ab 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -298,15 +298,15 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, bool AnalysisPredictor::GetFetch(std::vector *outputs, framework::Scope *scope) { VLOG(3) << "Predictor::get_fetch"; - outputs->resize(fetchs_.size()); - for (size_t i = 0; i < fetchs_.size(); ++i) { - int idx = boost::get(fetchs_[i]->GetAttr("col")); + outputs->resize(fetches_.size()); + for (size_t i = 0; i < fetches_.size(); ++i) { + int idx = boost::get(fetches_[i]->GetAttr("col")); PADDLE_ENFORCE((size_t)idx == i); framework::LoDTensor &fetch = framework::GetFetchVariable(*scope, "fetch", idx); auto type = fetch.type(); auto output = &(outputs->at(i)); - output->name = fetchs_[idx]->Input("X")[0]; + output->name = fetches_[idx]->Input("X")[0]; if (type == framework::proto::VarType::FP32) { GetFetchOne(fetch, output); output->dtype = PaddleDType::FLOAT32; @@ -327,7 +327,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetUseGPU(config_.use_gpu()); argument_.SetGPUDeviceId(config_.gpu_device_id()); argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); - argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_); + argument_.SetStaticMemoryOptim(config_.static_memory_optim_); + argument_.SetStaticMemoryOptimForceUpdate( + config_.static_memory_optim_force_update_); argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program if (!config_.model_dir().empty()) { @@ -422,10 +424,10 @@ void AnalysisPredictor::PrepareFeedFetch() { feed_names_[op->Output("Out")[0]] = idx; } else if (op->Type() == "fetch") { int idx = boost::get(op->GetAttr("col")); - if (fetchs_.size() <= static_cast(idx)) { - fetchs_.resize(idx + 1); + if (fetches_.size() <= static_cast(idx)) { + fetches_.resize(idx + 1); } - fetchs_[idx] = op; + fetches_[idx] = op; } } } @@ -638,12 +640,12 @@ bool 
AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { // check if the cache exists if (!config_.enable_memory_optim()) { need = false; - } else if (config_.enable_memory_optim() && + } else if (config_.static_memory_optim_ && !inference::IsFileExists(inference::analysis::GetMemoryCachePath( config_.model_dir(), config_.prog_file()))) { need = true; - } else if (config_.enable_memory_optim() && - config_.memory_optim_force_update_) { + } else if (config_.static_memory_optim_ && + config_.static_memory_optim_force_update_) { need = true; } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 9095b6ec1a..a8ea67d4bd 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -115,7 +115,7 @@ class AnalysisPredictor : public PaddlePredictor { std::shared_ptr inference_program_; std::vector feeds_; std::map feed_names_; - std::vector fetchs_; + std::vector fetches_; // Memory buffer for feed inputs. The temporary LoDTensor will cause serious // concurrency problems, wrong results and memory leak, so cache them. std::vector feed_tensors_; diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 1cee890450..f89eaeaadc 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -162,17 +162,7 @@ struct AnalysisConfig { /** Transform the AnalysisConfig to NativeConfig. */ - NativeConfig ToNativeConfig() const { - NativeConfig config; - config.model_dir = model_dir_; - config.prog_file = prog_file_; - config.param_file = params_file_; - config.use_gpu = use_gpu_; - config.device = device_id_; - config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); - config.specify_input_name = specify_input_name_; - return config; - } + NativeConfig ToNativeConfig() const; /** Specify the operator type list to use MKLDNN acceleration. * @param op_list the operator type list. */ @@ -195,7 +185,8 @@ struct AnalysisConfig { /** Turn on memory optimize * NOTE still in development, will release latter. */ - void EnableMemoryOptim(bool force_update_cache = false); + void EnableMemoryOptim(bool static_optim = false, + bool force_update_static_cache = false); /** Tell whether the memory optimization is activated. */ bool enable_memory_optim() const; @@ -241,7 +232,8 @@ struct AnalysisConfig { // memory reuse related. bool enable_memory_optim_{false}; - bool memory_optim_force_update_{false}; + bool static_memory_optim_{false}; + bool static_memory_optim_force_update_{false}; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 4ec9404ab4..e78ab942d1 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -253,7 +253,7 @@ void compare(bool use_mkldnn = false) { } // Compare result of NativeConfig and AnalysisConfig with memory optimization. -TEST(Analyzer_dam, compare_with_memory_optim) { +TEST(Analyzer_dam, compare_with_static_memory_optim) { // The small dam will core in CI, but works in local. 
   if (FLAGS_max_turn_num == 9) {
     contrib::AnalysisConfig cfg, cfg1;
     DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
 
     std::vector<std::vector<PaddleTensor>> input_slots_all;
     SetInput(&input_slots_all);
     // Run the first time to force to update memory cache
     SetConfig(&cfg);
-    cfg.EnableMemoryOptim(true);
+    cfg.EnableMemoryOptim(true, true /*force update*/);
 
     CompareNativeAndAnalysis(
         reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
         input_slots_all);
 
     // Run second time to use the memory cache and perform memory optimization.
     SetConfig(&cfg1);
-    cfg1.EnableMemoryOptim();
+    cfg1.EnableMemoryOptim(true, false /*do not force update*/);
 
     CompareNativeAndAnalysis(
         reinterpret_cast<const PaddlePredictor::Config *>(&cfg1),
         input_slots_all);
   }
 }
 
+TEST(Analyzer_dam, compare_with_dynamic_memory_optim) {
+  // The small dam will core in CI, but works in local.
+  if (FLAGS_max_turn_num == 9) {
+    contrib::AnalysisConfig cfg, cfg1;
+    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
+    SetInput(&input_slots_all);
+    // Run the first time to force to update memory cache
+    SetConfig(&cfg);
+    cfg.EnableMemoryOptim();
+
+    CompareNativeAndAnalysis(
+        reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+        input_slots_all);
+  }
+}
+
 TEST(Analyzer_dam, compare) { compare(); }
 
 #ifdef PADDLE_WITH_MKLDNN
From b43ea40c51f756ecdf46aea74ff7c290a0f41601 Mon Sep 17 00:00:00 2001
From: nhzlx
Date: Sat, 26 Jan 2019 09:24:09 +0000
Subject: [PATCH 114/123] delete the usage of the const_cast

test=develop
---
 paddle/fluid/inference/analysis/ir_pass_manager.cc   | 6 ++----
 paddle/fluid/operators/tensorrt/tensorrt_engine_op.h | 4 ++--
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 403ebfe72a..99611ce84b 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -67,10 +67,8 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
       pass->Set("min_subgraph_size",
                 new int(argument->tensorrt_min_subgraph_size()));
-      pass->Set(
-          "program",
-          new framework::ProgramDesc *(
-              const_cast<framework::ProgramDesc *>(&argument->main_program())));
+      pass->Set("program",
+                new framework::ProgramDesc *(&argument->main_program()));
 
       bool enable_int8 = argument->tensorrt_precision_mode() ==
                          contrib::AnalysisConfig::Precision::kInt8;
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index e83247d39e..2ff35c7c6a 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -114,9 +114,9 @@ class TensorRTEngineOp : public framework::OperatorBase {
     framework::Executor executor(dev_place);
     auto *block = Attr<framework::BlockDesc *>("sub_block");
     auto *program = block->Program();
-    auto *scope_ptr = const_cast<framework::Scope *>(&scope);
+    auto &current_scope = scope.NewScope();
     auto ctx = executor.Prepare(*program, block->ID());
-    executor.RunPreparedContext(ctx.get(), scope_ptr, false, true, true);
+    executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true);
   }
 
   void RunImpl(const framework::Scope &scope,
From d303270a0e3a640da1abc75936179c75250ba3e9 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Sat, 26 Jan 2019 21:23:53 +0800
Subject: [PATCH 115/123] revert test=develop (#15535)

---
 paddle/fluid/framework/details/execution_strategy.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git
a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
index 2edb50e0d8..318694a1d4 100644
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -28,7 +28,7 @@ struct ExecutionStrategy {
   // If we set this to 1, we will delete all variables when a batch finishes,
   // and this will lose 15%+ performance.
   // Please be aware of this parameter.
-  size_t num_iteration_per_drop_scope_{100};
+  size_t num_iteration_per_drop_scope_{1};
   ExecutorType type_{kDefault};
   bool dry_run_{false};
 };
From 806658d72b5cd2589bf1e2993f3621d520c9a075 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Sun, 27 Jan 2019 08:41:23 +0800
Subject: [PATCH 116/123] add space after colon in comment test=develop

---
 paddle/fluid/operators/distributed/rpc_server.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc
index 397976fe99..c3a46e348c 100644
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -111,8 +111,8 @@ void RPCServer::RegisterRPC(const std::string& rpc_name,
   static int cond = -1;
   rpc_cond_map_[rpc_name] = ++cond;
-  VLOG(3) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler
-          << ", cond:" << rpc_cond_map_[rpc_name];
+  VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler
+          << ", cond: " << rpc_cond_map_[rpc_name];
 }
 
 void RPCServer::SetCond(const std::string& rpc_name) {
From 4aa7ef3c1310427291371b3d1831d3e6adfeee33 Mon Sep 17 00:00:00 2001
From: Jacek Czaja
Date: Sun, 27 Jan 2019 12:37:19 +0100
Subject: [PATCH 117/123] - Compensation fix to LRN MKL-DNN op test=develop

---
 paddle/fluid/inference/tests/api/CMakeLists.txt | 5 +++++
 paddle/fluid/operators/lrn_mkldnn_op.cc         | 4 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index adbf98e9e8..989a9e275f 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -54,6 +54,7 @@ else()
   message(WARNING "These tests have been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1")
 endif()
 
+# RNN2
 set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
 download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
@@ -115,6 +116,10 @@ if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
 endif()
 inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)
 
+# googlenet
+inference_analysis_api_test_with_fake_data(test_analyzer_googlenet
+  "${INFERENCE_DEMO_INSTALL_DIR}/googlenet" analyzer_resnet50_tester.cc "googlenet.tar.gz" SERIAL)
+
 # resnet50
 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
   "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL)
diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc
index d4325b2c02..692933405b 100644
--- a/paddle/fluid/operators/lrn_mkldnn_op.cc
+++ b/paddle/fluid/operators/lrn_mkldnn_op.cc
@@ -67,7 +67,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel {
   mid->mutable_data(ctx.GetPlace());
 
   const int n = ctx.Attr<int>("n");
-  const float alpha = ctx.Attr<float>("alpha");
+  const float alpha = ctx.Attr<float>("alpha") * static_cast<float>(n);
   const float beta = ctx.Attr<float>("beta");
   const float k = ctx.Attr<float>("k");
   const bool is_test = ctx.Attr<bool>("is_test");
@@ -156,7 +156,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel {
   const std::string key_workspace_memory = key + "@lrn_workspace_memory";
 
   const int n = ctx.Attr<int>("n");
-  const float alpha = ctx.Attr<float>("alpha");
+  const float alpha = ctx.Attr<float>("alpha") * static_cast<float>(n);
   const float beta = ctx.Attr<float>("beta");
   const float k = ctx.Attr<float>("k");
From 5885c5cdf64571933ca4be9567908c7b5203c379 Mon Sep 17 00:00:00 2001
From: Jacek Czaja
Date: Sun, 27 Jan 2019 12:46:09 +0100
Subject: [PATCH 118/123] - Added explanation to LRN MKL-DNN op on alpha
 modification test=develop

---
 paddle/fluid/operators/lrn_mkldnn_op.cc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc
index 692933405b..097ba01d40 100644
--- a/paddle/fluid/operators/lrn_mkldnn_op.cc
+++ b/paddle/fluid/operators/lrn_mkldnn_op.cc
@@ -67,6 +67,12 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel {
   mid->mutable_data(ctx.GetPlace());
 
   const int n = ctx.Attr<int>("n");
+  // MKL-DNN implements LRN in a Caffe way:
+  // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html
+  // where the sum of squares is divided by the size of the normalization
+  // window; this is not the case for PaddlePaddle LRN.
+  // Hence we need to compensate for this difference by
+  // multiplying alpha by the size of the window (n).
   const float alpha = ctx.Attr<float>("alpha") * static_cast<float>(n);
   const float beta = ctx.Attr<float>("beta");
   const float k = ctx.Attr<float>("k");
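To make the compensation concrete, here is the arithmetic in a small sketch (illustrative only; the helper name is not from the patch):

  // Caffe-style LRN, as implemented by MKL-DNN, averages the windowed
  // sum of squares:   dst = src / pow(k + (alpha / n) * sum_sq, beta)
  // PaddlePaddle LRN does not divide by the window size:
  //                   dst = src / pow(k + alpha * sum_sq, beta)
  // Passing alpha' = alpha * n to MKL-DNN therefore restores Paddle
  // semantics, since (alpha' / n) * sum_sq == alpha * sum_sq.
  inline float CompensatedAlpha(float alpha, int n) {
    return alpha * static_cast<float>(n);
  }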
From aeca5c50b20dfa69509b14b820a66c83a1921eff Mon Sep 17 00:00:00 2001
From: Kaipeng Deng
Date: Sun, 27 Jan 2019 20:08:55 -0600
Subject: [PATCH 119/123] fix grid_sampler PADDLE_ENFORCE error. test=develop
 (#15542)

---
 paddle/fluid/operators/grid_sampler_op.cc | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc
index 14a2524bd8..241184c6f4 100644
--- a/paddle/fluid/operators/grid_sampler_op.cc
+++ b/paddle/fluid/operators/grid_sampler_op.cc
@@ -43,12 +43,14 @@ class GridSampleOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2.");
     PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0],
                       "Input(X) and Input(Grid) dims[0] should be equal.");
-    PADDLE_ENFORCE_EQ(
-        grid_dims[1], x_dims[2],
-        "Input(X) dims[2] and Input(Grid) dims[1] should be equal.");
-    PADDLE_ENFORCE_EQ(
-        grid_dims[2], x_dims[3],
-        "Input(X) dims[3] and Input(Grid) dims[2] should be equal.");
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(
+          grid_dims[1], x_dims[2],
+          "Input(X) dims[2] and Input(Grid) dims[1] should be equal.");
+      PADDLE_ENFORCE_EQ(
+          grid_dims[2], x_dims[3],
+          "Input(X) dims[3] and Input(Grid) dims[2] should be equal.");
+    }
 
     ctx->SetOutputDim("Output", x_dims);
     ctx->ShareLoD("X", "Output");
From f82515800c0c0f3b85f1dfaf56fb5690e4c70681 Mon Sep 17 00:00:00 2001
From: Haihao Shen
Date: Mon, 28 Jan 2019 11:11:42 +0800
Subject: [PATCH 120/123] Enable INT8 Calibration Unit Test for MobileNet-V1
 (#15539)

* Enable mobilenet UT in a separate test class; use the download cache via
  the Paddle download utility and cache the unzipped files; fix a typo;
  test=develop

* Extract a cache_unzipping function for reuse; format code style;
  test=develop

* Simplify the test code by defining a combined function for both
  downloading and unzipping; test=develop

---
 .../fluid/contrib/tests/test_calibration.py | 76 ++++++++++++++-----
 1 file changed, 56 insertions(+), 20 deletions(-)

diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py
index ed5ea70260..f07fefe7e0 100644
--- a/python/paddle/fluid/contrib/tests/test_calibration.py
+++ b/python/paddle/fluid/contrib/tests/test_calibration.py
@@ -23,6 +23,7 @@ import argparse
 import functools
 import contextlib
 import paddle.fluid.profiler as profiler
+from paddle.dataset.common import download
 from PIL import Image, ImageEnhance
 import math
 sys.path.append('..')
@@ -116,27 +117,44 @@ def val(data_dir=DATA_DIR):
     return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir)
 
 
-class TestCalibration(unittest.TestCase):
+class TestCalibrationForResnet50(unittest.TestCase):
     def setUp(self):
-        # TODO(guomingz): Put the download process in the cmake.
-        # Download and unzip test data set
-        imagenet_dl_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz'
-        zip_file_name = imagenet_dl_url.split('/')[-1]
-        cmd = 'rm -rf data {} && mkdir data && wget {} && tar xvf {} -C data'.format(
-            zip_file_name, imagenet_dl_url, zip_file_name)
-        os.system(cmd)
-        # resnet50 fp32 data
-        resnet50_fp32_model_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz'
-        resnet50_zip_name = resnet50_fp32_model_url.split('/')[-1]
-        resnet50_unzip_folder_name = 'resnet50_fp32'
-        cmd = 'rm -rf {} {} && mkdir {} && wget {} && tar xvf {} -C {}'.format(
-            resnet50_unzip_folder_name, resnet50_zip_name,
-            resnet50_unzip_folder_name, resnet50_fp32_model_url,
-            resnet50_zip_name, resnet50_unzip_folder_name)
+        self.int8_download = 'int8/download'
+        self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
+                                               self.int8_download)
+
+        data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz'
+        data_md5 = '1b6c1c434172cca1bf9ba1e4d7a3157d'
+        self.data_cache_folder = self.download_data(data_url, data_md5, "data")
+
+        # reader/decorator.py requires the relative path to the data folder
+        cmd = 'rm -rf {0} && ln -s {1} {0}'.format("data",
+                                                   self.data_cache_folder)
         os.system(cmd)
 
         self.iterations = 50
 
+    def cache_unzipping(self, target_folder, zip_path):
+        if not os.path.exists(target_folder):
+            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
+                                                          zip_path)
+            os.system(cmd)
+
+    def download_data(self, data_url, data_md5, folder_name):
+        download(data_url, self.int8_download, data_md5)
+        data_cache_folder = os.path.join(self.cache_folder, folder_name)
+        file_name = data_url.split('/')[-1]
+        zip_path = os.path.join(self.cache_folder, file_name)
+        self.cache_unzipping(data_cache_folder, zip_path)
+        return data_cache_folder
+
+    def download_resnet50_model(self):
+        # resnet50 fp32 data
+        data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz'
+        data_md5 = '4a5194524823d9b76da6e738e1367881'
+        self.model_cache_folder = self.download_data(data_url, data_md5,
+                                                     "resnet50_fp32")
+
     def run_program(self, model_path, generate_int8=False, algo='direct'):
         image_shape = [3, 224, 224]
         os.environ['FLAGS_use_mkldnn'] = 'True'
@@ -204,14 +222,32 @@
             calibrator.save_int8_model()
 
             print(
-                "Calibration is done and the corresponding files were generated at {}".
+                "Calibration is done and the corresponding files are generated at {}".
                format(os.path.abspath("calibration_out")))
         else:
             return np.sum(test_info) / cnt
 
-    def test_calibration_for_resnet50(self):
-        fp32_acc1 = self.run_program("resnet50_fp32/model")
-        self.run_program("resnet50_fp32/model", True)
+    def test_calibration(self):
+        self.download_resnet50_model()
+        fp32_acc1 = self.run_program(self.model_cache_folder + "/model")
+        self.run_program(self.model_cache_folder + "/model", True)
         int8_acc1 = self.run_program("calibration_out")
         delta_value = np.abs(fp32_acc1 - int8_acc1)
         self.assertLess(delta_value, 0.01)
+
+
+class TestCalibrationForMobilenetv1(TestCalibrationForResnet50):
+    def download_mobilenetv1_model(self):
+        # mobilenetv1 fp32 data
+        data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
+        data_md5 = '13892b0716d26443a8cdea15b3c6438b'
+        self.model_cache_folder = self.download_data(data_url, data_md5,
+                                                     "mobilenetv1_fp32")
+
+    def test_calibration(self):
+        self.download_mobilenetv1_model()
+        fp32_acc1 = self.run_program(self.model_cache_folder + "/model")
+        self.run_program(self.model_cache_folder + "/model", True, algo='KL')
+        int8_acc1 = self.run_program("calibration_out")
+        delta_value = np.abs(fp32_acc1 - int8_acc1)
+        self.assertLess(delta_value, 0.01)
From 526790e652502a3299b079203ec1b69f5633334a Mon Sep 17 00:00:00 2001
From: Yan Chunwei
Date: Mon, 28 Jan 2019 14:35:31 +0800
Subject: [PATCH 121/123] infer get program (#15511)

---
 paddle/fluid/inference/api/analysis_predictor.cc        | 4 ++++
 paddle/fluid/inference/api/analysis_predictor.h         | 2 ++
 paddle/fluid/inference/api/analysis_predictor_tester.cc | 2 ++
 paddle/fluid/inference/api/paddle_api.h                 | 8 ++++++++
 4 files changed, 16 insertions(+)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 7d97aea714..3a5f21d475 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -726,6 +726,10 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
   return need;
 }
 
+std::string AnalysisPredictor::GetSeriazlizedProgram() const {
+  return inference_program_->Proto()->SerializeAsString();
+}
+
 template <>
 std::unique_ptr CreatePaddlePredictor(
     const contrib::AnalysisConfig &config) {
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 921aa90952..fa1d0d596d 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -75,6 +75,8 @@ class AnalysisPredictor : public PaddlePredictor {
 
   void SetMkldnnThreadID(int tid);
 
+  std::string GetSeriazlizedProgram() const override;
+
  protected:
   // For memory optimization.
   bool need_collect_var_shapes_for_memory_optim();
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 4688e93d71..20b61344da 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -215,6 +215,8 @@ TEST(AnalysisPredictor, memory_optim) {
   {
     // The first predictor helps to cache the memory optimize strategy.
     auto predictor = CreatePaddlePredictor(config);
+    LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram();
+    ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty());
 
     // Run several times to check the parameters are not reused by mistake.
     for (int i = 0; i < 5; i++) {
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index 46b510fd1e..4fc12c294a 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -215,6 +215,14 @@ class PaddlePredictor {
    */
   virtual ~PaddlePredictor() = default;
 
+  /** \brief Get the serialized model program that executes in inference phase.
+   * Its data type is ProgramDesc, which is a protobuf message.
+   */
+  virtual std::string GetSeriazlizedProgram() const {
+    assert(false);  // Force raise error.
+    return "NotImplemented";
+  };
+
   /** The common configs for all the predictors.
    */
   struct Config {
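As a usage note, a caller can round-trip the returned string through the ProgramDesc protobuf; a minimal sketch (illustrative only — it assumes the generated framework.pb.h header and an already-built config):

  #include "paddle/fluid/framework/framework.pb.h"

  auto predictor = paddle::CreatePaddlePredictor(config);
  std::string buf = predictor->GetSeriazlizedProgram();
  paddle::framework::proto::ProgramDesc desc;
  // Per the doc comment above, the string is a serialized ProgramDesc.
  if (!desc.ParseFromString(buf)) {
    LOG(ERROR) << "failed to parse the serialized inference program";
  }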
From b62b756b288a946db44695ef0049c7d4bd139a13 Mon Sep 17 00:00:00 2001
From: Yan Chunwei
Date: Mon, 28 Jan 2019 14:46:00 +0800
Subject: [PATCH 122/123] add version support (#15469)

---
 paddle/fluid/framework/CMakeLists.txt    | 22 +++++++++++++++++++++-
 paddle/fluid/framework/commit.h.in       | 21 +++++++++++++++++++++
 paddle/fluid/inference/api/api.cc        | 10 ++++++++++
 paddle/fluid/inference/api/api_tester.cc |  6 ++++++
 paddle/fluid/inference/api/paddle_api.h  |  2 ++
 5 files changed, 60 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/framework/commit.h.in

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 2ba2437de6..66f11dedba 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -1,4 +1,3 @@
-
 #windows treat symbolic file as a real file, which is different with unix
 #We create a hidden file and compile it instead of origin source file.
 function(windows_symbolic TARGET)
@@ -207,3 +206,24 @@ endif (NOT WIN32)
 
 cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack)
 cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog)
+
+# Get the current working branch
+execute_process(
+  COMMAND git rev-parse --abbrev-ref HEAD
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_BRANCH
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+# Get the latest abbreviated commit hash of the working branch
+execute_process(
+  COMMAND git log -1 --format=%h
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_COMMIT
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+message(STATUS "commit: ${PADDLE_COMMIT}")
+message(STATUS "branch: ${PADDLE_BRANCH}")
+
+configure_file(commit.h.in commit.h)
diff --git a/paddle/fluid/framework/commit.h.in b/paddle/fluid/framework/commit.h.in
new file mode 100644
index 0000000000..3a33ece624
--- /dev/null
+++ b/paddle/fluid/framework/commit.h.in
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <string>
+
+namespace paddle {
+namespace framework {
+
+static std::string paddle_commit() {
+  return "@PADDLE_COMMIT@";
+}
+
+static std::string paddle_compile_branch() {
+  return "@PADDLE_BRANCH@";
+}
+
+static std::string paddle_version() {
+  return "@PADDLE_VERSION@";
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index 9be059c73e..6cd18277d6 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include <sstream>
+#include "paddle/fluid/framework/commit.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
@@ -97,4 +99,12 @@ void PaddleBuf::Free() {
   }
 }
 
+std::string get_version() {
+  std::stringstream ss;
+  ss << "version: " << framework::paddle_version() << "\n";
+  ss << "commit: " << framework::paddle_commit() << "\n";
+  ss << "branch: " << framework::paddle_compile_branch() << "\n";
+  return ss.str();
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_tester.cc b/paddle/fluid/inference/api/api_tester.cc
index 7a579610ee..2c450ef7ce 100644
--- a/paddle/fluid/inference/api/api_tester.cc
+++ b/paddle/fluid/inference/api/api_tester.cc
@@ -61,4 +61,10 @@ TEST(paddle_inference_api, demo) {
   predictor->Run({}, &outputs);
 }
 
+TEST(paddle_inference_api, get_version) {
+  LOG(INFO) << "paddle version:\n" << get_version();
+  auto version = get_version();
+  ASSERT_FALSE(version.empty());
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index 4fc12c294a..4069832246 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -296,4 +296,6 @@ std::unique_ptr CreatePaddlePredictor(const ConfigT& config);
 
 int PaddleDtypeSize(PaddleDType dtype);
 
+std::string get_version();
+
 }  // namespace paddle
From a6910f900e5683f70a9110d4b1a22f54e051c8e5 Mon Sep 17 00:00:00 2001
From: qingqing01
Date: Mon, 28 Jan 2019 15:26:22 +0800
Subject: [PATCH 123/123] Always create variables in analysis_predictor before
 OptimizeInferenceProgram. (#15533)

Otherwise, some other persistable variable (like RAW type) will not be
created

---
 .../fluid/inference/api/analysis_predictor.cc | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 3a5f21d475..66374cb7f0 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -123,6 +123,15 @@ bool AnalysisPredictor::PrepareProgram(
   if (!program) {
     if (!LoadProgramDesc()) return false;
 
+    // If not cloned, the parameters should be loaded.
+    // If config_.ir_optim() is True, parameters are loaded in
+    // OptimizeInferenceProgram(), but other persistable variables
+    // (like RAW type vars) are not created in the scope.
+    // If config_.ir_optim() is False, parameters are loaded in
+    // LoadParameters(), and we still need to create the other persistable
+    // variables.
+    // So in both cases, create the persistable variables first.
+    executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
+
     // Optimize the program, and load parameters and modify them in the
     // scope_.
     // This will change the scope_ address.
@@ -130,15 +139,6 @@
       status_ir_optim_enabled_ = true;
       OptimizeInferenceProgram();
     } else {
-      // If the parent_scope is passed, we assert that the persistable
-      // variables are already created, so just create the non-persistable
-      // variables.
-
-      // If not cloned, the parameters should be loaded in
-      // OptimizeInferenceProgram.
-      // So in both cases, just the local variables are needed to load, not
-      // the parameters.
-      executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
-
       // Load parameters
       LOG(INFO) << "load parameters ";
       LoadParameters();
@@ -376,7 +376,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   }
   argument_.SetIrAnalysisPasses(passes);
   argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
-  argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
+  argument_.SetScopeNotOwned(scope_.get());
   Analyzer().Run(&argument_);
   PADDLE_ENFORCE(argument_.scope_valid());
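To close out the series, a minimal sketch of the version API added in patch 122 (illustrative only; the surrounding main() is not from the patches):

  #include <iostream>
  #include "paddle/fluid/inference/api/paddle_inference_api.h"

  int main() {
    // Prints "version: ...", "commit: ...", and "branch: ..." as embedded
    // at build time by the configure_file(commit.h.in commit.h) step above.
    std::cout << paddle::get_version();
    return 0;
  }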