diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index c08e844847..4b0eff3adb 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -6,7 +6,10 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context) + cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) +cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor) + cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) @@ -51,10 +54,6 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope frame cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) - -cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor) -cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place) - cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index bc0da55cda..8fd2906107 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -22,7 +22,6 @@ #include "paddle/framework/block_desc.h" #include "paddle/framework/op_registry.h" -#include "paddle/operators/dynamic_recurrent_op.h" #include "paddle/operators/net_op.h" namespace paddle { @@ -218,21 +217,6 @@ static std::unique_ptr BackwardRecursive( return false; }); - // process recurrent gradient op as a special operator. - if (forwardOp.Type() == "dynamic_recurrent") { - // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), - // or this will result in infinite loop. - const auto& rnnop = - *static_cast(&forwardOp); - auto rnn_grad_op = - static_cast(grad_op.get()); - const auto& stepnet_op = - *static_cast(&rnnop.rnn.GetStepUnit()); - // create stepnet's gradient op - rnn_grad_op->rnn.SetStepUnit( - BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id)); - } - if (net->ops_.empty()) { // Current no aux op is added to network return grad_op; } diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index 7f8a51cc58..21bdfca111 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -24,6 +24,7 @@ #include #include "paddle/framework/ddim.h" #include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" #include "paddle/platform/enforce.h" #include "paddle/platform/place.h" @@ -175,9 +176,9 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level, PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1); for (size_t ins = 0; ins < num_instances; ins++) { for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) { - tensor.Slice(elem, elem + 1) - .CopyFrom(source.Slice(ins, ins + 1), platform::CPUPlace(), - platform::CPUDeviceContext()); + auto slice = tensor.Slice(elem, elem + 1); + CopyFrom(source.Slice(ins, ins + 1), platform::CPUPlace(), + platform::CPUDeviceContext(), &slice); } } return tensor; diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 28d0fcf94e..6a0c5133c9 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -89,34 +89,6 @@ class Tensor { /*! The internal of two tensors share the same memory block. */ inline Tensor& ShareDataWith(const Tensor& src); - /** - * @brief Copy the content of external tensor to a new place. - * - * @param[in] src The external tensor. - * @param[in] dst_place The dst place. - * @param[in] ctx The device context contains device resources. - * - * @note CopyFrom supports CPU <-> GPU, GPU <-> GPU. - */ - // TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647 - // Remove `CopyFrom` and `CopyFromVector` from Tensor interface - // and make them global functions - inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, - const platform::DeviceContext& ctx); - - /** - * @brief Copy the content of an external vector to a tensor. - * - * @param[in] src The external tensor. - * @param[in] ctx The device context contains device resources. - * - * * @note CopyFromVector assumes that the tensor has been resized - * before invoking. - */ - template - inline void CopyFromVector(const std::vector& src, - const platform::DeviceContext& ctx); - /** * @brief Return a sub-tensor of the given tensor. * @@ -141,7 +113,6 @@ class Tensor { size_t memory_size() const; - private: inline void check_memory_size() const; private: diff --git a/paddle/framework/tensor_array.cc b/paddle/framework/tensor_array.cc deleted file mode 100644 index 6058f1b8b1..0000000000 --- a/paddle/framework/tensor_array.cc +++ /dev/null @@ -1,444 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - - - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/framework/tensor_array.h" - -#include -#include -#include - -#include "paddle/framework/eigen.h" - -namespace paddle { -namespace framework { - -namespace detail { - -/* - * Offer an iterator over the length-sorted lod-tensor's top level. The top - * level of a lod-tensor stores batch-size of sequences, each top-level sequence - * may contains several lower-level sequences, sort top-level lod by the numbers - * of lower-level sequences in descending order, so that during RNN's running, - * the batch-size will keep decreasing, the short sentences will end at the tail - * of each batch. - * - * Let's take a simple lod-tensor for example - * - * |(0) |(1) top-level has two instances - * ||| ||||| lower-level - * - * sort by lower-level's length - * - * |(1) |(0) - * ||||| ||| - * - * when RNN runs, it get 5 batches (equals the number of elements the longest - * sequence has) - * - * ||||| - * ||| - * - * the first three batches has two elements, the last two elements just has 1 - * element each. - */ -struct DynamicBatchUnpacker { - using value_type = float; - - DynamicBatchUnpacker(const LoDTensor& source, size_t level, - bool descend = true) - : source(&source), level(level) { - BuildLengthSortedMeta(descend); - } - - LoDTensor GetBatch(size_t index); - - std::vector meta; - - LoDTensor const* source; - size_t level; - - protected: - void BuildLengthSortedMeta(bool descend); -}; - -LoDTensor PackDynamicBatch(const std::vector& source, - const std::vector& meta, const LoD& lod, - size_t level); - -std::vector GenDyBatchIndice(const DySeqMetaBatch& meta, int batch_id) { - // collect indice need to copy to the batch - std::vector indice; - for (const auto& seq : meta) { - size_t id = seq.begin + batch_id; - if (id >= seq.end) break; - indice.push_back(id); - } - return indice; -} - -} // namespace detail - -const LoDTensor& TensorArray::Read(size_t index) const { - PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index); - if (index >= size()) { - values_.resize(index + 1); - } - return values_[index]; -} - -void TensorArray::Write(size_t index, const LoDTensor& value) { - PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index); - - if (index >= size()) { - values_.resize(index + 1); - } - - values_[index].set_lod(value.lod()); - values_[index].Resize(value.dims()); - values_[index].mutable_data(value.place()); - values_[index].CopyFrom(value, value.place(), platform::CPUDeviceContext()); -} - -void TensorArray::WriteShared(size_t index, const LoDTensor& value) { - PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index); - if (index >= size()) { - values_.resize(index + 1); - } - - values_[index].set_lod(value.lod()); - values_[index].ShareDataWith(value); -} - -LoDTensor TensorArray::Pack(size_t level, const std::vector& meta, - const LoD& lod) const { - return detail::PackDynamicBatch(values_, meta, lod, level); -} - -DySeqMetaBatch TensorArray::Unpack(const LoDTensor& source, int level, - bool length_desend) { - detail::DynamicBatchUnpacker unpacker(source, level, - length_desend /*descend*/); - - // find max length of all the sequences - size_t max_length = 0; - for (const auto& seq : unpacker.meta) { - max_length = std::max(max_length, seq.end - seq.begin); - } - - // write batches to values - for (size_t batch_id = 0; batch_id < max_length; batch_id++) { - Write(batch_id, unpacker.GetBatch(batch_id)); - } - - PADDLE_ENFORCE(!unpacker.meta.empty()); - return unpacker.meta; -} - -LoDTensor TensorArray::LodPack(size_t level) const { - PADDLE_ENFORCE_GT(size(), 0UL, "no time step exists"); - // the levels should be no less than 2 - LoDTensor merged; - const LoDTensor *pre, *cur; - pre = &Read(0); - - for (size_t step = 1; step < size(); step++) { - cur = &Read(step); - PADDLE_ENFORCE_GT(cur->NumLevels(), 0); - PADDLE_ENFORCE_GT(pre->NumLevels(), 0); - PADDLE_ENFORCE_EQ(pre->NumLevels(), cur->NumLevels()); - PADDLE_ENFORCE_EQ(pre->NumElements(level), cur->NumElements(level)); - - merged = LodPackTwo(*pre, *cur, level); - pre = &merged; - } - return merged; -} - -/* - * NOTE currently, only the lowest level supports packing. - * The lowest LoD will be changed, while the relative offsets in levels above - * stay unchanged. - * - * previous step : [0] [1] [3] - * current step: [0 1 2] [2 3] [] - * packed to - * [0 0] [0 1] [0 2] [1 2] [1 3] [3] - */ -LoDTensor TensorArray::LodPackTwo(const LoDTensor& pre, const LoDTensor& cur, - size_t level) const { - PADDLE_ENFORCE_EQ(pre.NumLevels(), cur.NumLevels()); - PADDLE_ENFORCE_EQ(pre.NumLevels(), level + 1, - "Only the lowest LoD level supports pack temporarily."); - // calculate the result tensor's shape first - size_t num_instances = 0; - for (size_t elem = 0; elem < pre.NumElements(level); elem++) { - size_t prefix_size = pre.NumElements(level, elem); - size_t num_candidates = cur.NumElements(level, elem); - if (num_candidates > 0) { - num_instances += num_candidates * (prefix_size + 1); - } else { - num_instances += prefix_size; - } - } - - auto res_dims = pre.dims(); - res_dims[0] = num_instances; - LoDTensor result; - result.Resize(res_dims); - result.mutable_data(cur.place()); - - Vector last_lod_level; - // copy data - size_t index = 0; - last_lod_level.push_back(index); - for (size_t elem = 0; elem < pre.NumElements(level); elem++) { - size_t prefix_size = pre.NumElements(level, elem); - size_t num_candidates = cur.NumElements(level, elem); - - // slice the prefix Tensor - LoDTensor prefix = pre; - prefix.ShrinkInLevel(level, elem, elem + 1); - LoDTensor candidate = cur; - if (num_candidates > 0) { - candidate.ShrinkInLevel(level, elem, elem + 1); - } else { // just push prefix - result.Slice(index, index + prefix_size) - .CopyFrom(prefix, result.place(), platform::CPUDeviceContext()); - index += prefix_size; - last_lod_level.push_back(index); - } - for (size_t candi = 0; candi < num_candidates; candi++) { - // TODO(superjom) support GPU - result.Slice(index, index + prefix_size) - .CopyFrom(prefix, result.place(), platform::CPUDeviceContext()); - index += prefix_size; - // copy candidate record - result.Slice(index, index + 1) - .CopyFrom(candidate.Slice(candi, candi + 1), result.place(), - platform::CPUDeviceContext()); - index++; - last_lod_level.push_back(index); - } - } - - // update lod - auto lod = cur.lod(); - lod.back() = last_lod_level; - result.set_lod(lod); - return result; -} - -/* - * source [0 1 2] [3 4] [5 6 7] will be transformd to a list of LoDTensors such - * as - * [0 3 5] [1 4 6] [2 7] with 1-level LoDs: - * - [0 1 2 3] - * - [0 1 2 3] - * - [0 1 1 2], the [1,1) here means the second sequence is empty - * - * NOTE Unpack a LoDTensor in this approach may result in a big LoD. - */ -void TensorArray::LodUnpack(const LoDTensor& source, size_t level) { - PADDLE_ENFORCE_EQ(level, source.NumLevels() - 1, - "only the lowest LoD level supports unpack."); - const size_t non_empty_instances = source.dims()[0]; - size_t index = 0; - Vector lowest_lod_level; - lowest_lod_level.push_back(index); - - for (size_t step = 0; step < non_empty_instances; step++) { - size_t num_instances = 0; - for (size_t id = 0; id < source.NumElements(level); id++) { - auto instance = source; - instance.ShrinkInLevel(level, id, id + 1); - if (static_cast(instance.dims()[0]) > step) { - num_instances++; - index++; - } - lowest_lod_level.push_back(index); - } - - // create tensor for this time step - LoDTensor tensor; - auto dims = source.dims(); - dims[0] = num_instances; - // set lod - auto lod = source.lod(); - lod.back() = lowest_lod_level; - tensor.set_lod(lod); - - index = 0; - for (size_t id = 0; id < source.NumElements(level); id++) { - auto instance = source; - instance.ShrinkInLevel(level, id, id + 1); - if (static_cast(instance.dims()[0]) > step) { - // copy this instance - tensor.Slice(index, index + 1) - .CopyFrom(instance.Slice(step, step + 1), tensor.place(), - platform::CPUDeviceContext()); - index++; - } - } - Write(step, tensor); - } -} - -LoDTensor TensorArray::Stack() const { - LoDTensor result; - if (size() == 0) return result; - - const auto& first_dims = values_.front().dims(); - // check all the values have the same shape - // TODO(superjom) check the same data_type - for (size_t idx = 1; idx < size(); idx++) { - const auto& value_dims = values_[idx].dims(); - PADDLE_ENFORCE_EQ(first_dims, value_dims); - } - - // copy - auto result_dims = vectorize(first_dims); - result_dims.insert(result_dims.begin(), size()); - result.Resize(make_ddim(result_dims)); - result.mutable_data(platform::CPUPlace()); - - for (size_t idx = 0; idx < size(); idx++) { - result.Slice(idx, idx + 1) - .CopyFrom(Read(idx), platform::CPUPlace(), - platform::CPUDeviceContext()); - } - return result; -} - -void TensorArray::Unstack(const LoDTensor& source) const { - Unstack(source, false /*data_shared*/); -} - -void TensorArray::UnstackShared(const LoDTensor& source) const { - Unstack(source, true /*data_shared*/); -} - -void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const { - size_t first_dim = source.dims()[0]; - DDim value_dims = slice_ddim(source.dims(), 1, source.dims().size()); - PADDLE_ENFORCE_GT(first_dim, 0, - "source should have some data to be unstacked"); - - values_.resize(first_dim); - - for (size_t elem = 0; elem < first_dim; elem++) { - // create a new value - auto& value = values_[elem]; - if (data_shared) { - // share memory - value.ShareDataWith(source.Slice(elem, elem + 1)); - } else { - // copy - value.Resize(value_dims); - value.CopyFrom(source.Slice(elem, elem + 1), platform::CPUPlace(), - platform::CPUDeviceContext()); - } - } -} - -size_t TensorArray::size() const { return values_.size(); } - -namespace detail { - -void DynamicBatchUnpacker::BuildLengthSortedMeta(bool descend) { - PADDLE_ENFORCE(meta.empty(), "duplicate build meta"); - // collect meta for each sequence in some level - auto lod = SliceLevels(source->lod(), level, level + 1)[0]; - - for (size_t seq_id = 0; seq_id < lod.size() - 1; seq_id++) { - DySeqMeta seq_meta({lod[seq_id], lod[seq_id + 1], seq_id}); - meta.push_back(seq_meta); - } - - PADDLE_ENFORCE_GT(meta.size(), 0, "meta is empty"); - - // sort by length - sort(meta.begin(), meta.end(), - [descend](const DySeqMeta& a, const DySeqMeta& b) { - bool a_ge_b = (a.end - a.begin) > (b.end - b.begin); - return descend ? a_ge_b : !a_ge_b; - }); -} - -LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) { - PADDLE_ENFORCE(!meta.empty(), "should build meta first"); - LoDTensor result; - - auto indice = detail::GenDyBatchIndice(meta, index); - PADDLE_ENFORCE(!indice.empty(), "invalid batch at %d", index); - - // copy the indice of records in LoDTensor - auto record_dims = slice_ddim(source->dims(), 1, source->dims().size()); - auto record_dims_vec = vectorize(record_dims); - record_dims_vec.insert(record_dims_vec.begin(), indice.size()); - result.Resize(make_ddim(record_dims_vec)); - result.mutable_data(platform::CPUPlace()); - - for (size_t i = 0; i < indice.size(); i++) { - auto index = indice[i]; - auto target = result.Slice(i, i + 1); - auto slice = source->Slice(index, index + 1); - - target.CopyFrom(slice, platform::CPUPlace(), platform::CPUDeviceContext()); - } - - return result; -} - -// TODO(supejom) to cache lod if reasonable -LoDTensor PackDynamicBatch(const std::vector& source, - const std::vector& meta, const LoD& lod, - size_t level) { - PADDLE_ENFORCE(!source.empty()); - PADDLE_ENFORCE(!meta.empty()); - PADDLE_ENFORCE(!lod.empty()); - - LoDTensor result; - - // init result space - auto record_dims = slice_ddim(source[0].dims(), 1, source[0].dims().size()); - auto record_dims_vec = vectorize(record_dims); - auto height = lod[level].back(); - record_dims_vec.insert(record_dims_vec.begin(), height); - result.Resize(make_ddim(record_dims_vec)); - result.mutable_data(platform::CPUPlace()); - - for (size_t batch_id = 0; batch_id < source.size(); batch_id++) { - for (size_t seq_id = 0; seq_id < meta.size(); seq_id++) { - const auto& seq_meta = meta[seq_id]; - // source is source[batch_id][seq_id] - // target is result[index] - auto index = seq_meta.begin + batch_id; - if (index >= seq_meta.end) break; - auto source_ = source[batch_id].Slice(seq_id, seq_id + 1); - auto target = result.Slice(index, index + 1); - target.CopyFrom(source_, platform::CPUPlace(), - platform::CPUDeviceContext()); - } - } - - result.set_lod(lod); - return result; -} - -} // namespace detail - -} // namespace framework -} // namespace paddle diff --git a/paddle/framework/tensor_array.h b/paddle/framework/tensor_array.h deleted file mode 100644 index 78fad8cab7..0000000000 --- a/paddle/framework/tensor_array.h +++ /dev/null @@ -1,132 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include - -#include "paddle/framework/lod_tensor.h" - -namespace paddle { -namespace framework { - -/* - * DyBatchSeqPosition stores indices of the basic element in tensor. It is used - * after lod-tensor's re-assembling, its info can be used to recover the order - * in original lod-tensor. - */ -struct DySeqMeta { - DySeqMeta(size_t begin, size_t end, size_t ori_idx) - : begin(begin), end(end), ori_idx(ori_idx) {} - - size_t begin; - size_t end; // not included - size_t ori_idx; -}; - -using DySeqMetaBatch = std::vector; - -/* - * Extract the indices of instances. - */ -std::vector GenDyBatchIndice(const DySeqMetaBatch &metas, int batch_id); - -/* - * TensorArray is a C-array-like array of tensors, it is meant to be used with - * dynamic iteration primitives such as while_loop. It is used to segment inputs - * and store states in all time steps. - * - * By providing some methods similar to a C++ array, the difinition of some - * state-based dynamic models such as RNN cound be more natural and highly - * flexible. - */ -class TensorArray { - public: - using value_type = float; - - // max number of values allowed to store. - const size_t MAX_SIZE{100000}; - - /* - * Read the value at location `index` in the `TensorArray`. - */ - const LoDTensor &Read(size_t index) const; - - /* - * Write value into the index of the TensorArray. - */ - void Write(size_t index, const LoDTensor &value); - - /* - * Write value into the index of the TensorArray, with memory shared. - */ - void WriteShared(size_t index, const LoDTensor &value); - - /* - * Recover the original LoD-arranged LoDTensor with the `values`, `level` and - * `indice_map`. - */ - LoDTensor Pack(size_t level, const DySeqMetaBatch &meta, - const LoD &lod) const; - - /* - * Split LoDTensor in some `level` and write the generated batches to - * `values`, if set `desend`, will sort by length in descending order else in - * ascending order. - */ - DySeqMetaBatch Unpack(const LoDTensor &source, int level, bool length_desend); - - /* - * Pack an array of LoDTensors to a LoDTensor. - */ - LoDTensor LodPack(size_t level) const; - - /* - * Unpack a LoDTensor to an array of LoDTensors. - */ - void LodUnpack(const LoDTensor &source, size_t level); - - /* - * Pack the values into a tensor with rank one higher than each tensor in - * values. - */ - LoDTensor Stack() const; - - /* - * Unstacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors. - */ - void Unstack(const LoDTensor &source) const; - - /* - * Unstacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors, - * with memory of tensors shared. - */ - void UnstackShared(const LoDTensor &source) const; - - /* - * Return the number of values. - */ - size_t size() const; - - protected: - void Unstack(const LoDTensor &source, bool data_shared) const; - - LoDTensor LodPackTwo(const LoDTensor &pre, const LoDTensor &cur, - size_t level) const; - - private: - mutable std::vector values_; -}; // class TensorArray - -} // namespace framework -} // namespace paddle diff --git a/paddle/framework/tensor_array_test.cc b/paddle/framework/tensor_array_test.cc deleted file mode 100644 index 83b52b442d..0000000000 --- a/paddle/framework/tensor_array_test.cc +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/framework/tensor_array.h" - -#include - -namespace paddle { -namespace framework { - -class TensorArrayTester : public ::testing::Test { - protected: - void SetUp() override { - LoDTensor source; - source.Resize(make_ddim({batch_size, dim})); - int* data = source.mutable_data(platform::CPUPlace()); - for (int i = 0; i < 16 * 32; i++) { - data[i] = i; - } - ta.Unstack(source); - } - - TensorArray ta; - const int batch_size = 16; - const int dim = 32; -}; - -TEST_F(TensorArrayTester, Read) { - for (int i = 0; i < batch_size; i++) { - const auto& tensor = ta.Read(i); - ASSERT_EQ(tensor.dims()[0], 1); - ASSERT_EQ(tensor.dims()[1], dim); - } -} - -TEST_F(TensorArrayTester, Write) { - LoDTensor source; - source.Resize(make_ddim({1, dim})); - for (int i = 0; i < dim; i++) { - *(source.mutable_data(platform::CPUPlace()) + i) = i; - } - - ta.Write(2, source); - - const auto& tensor = ta.Read(2); - for (int i = 0; i < dim; i++) { - EXPECT_EQ(*(tensor.data() + i), *(source.data() + i)); - } -} - -TEST_F(TensorArrayTester, WriteShared) { - LoDTensor source; - source.Resize(make_ddim({1, dim})); - for (int i = 0; i < dim; i++) { - *(source.mutable_data(platform::CPUPlace()) + i) = i; - } - - ta.WriteShared(2, source); - - const auto& tensor = ta.Read(2); - for (int i = 0; i < dim; i++) { - EXPECT_EQ(*(tensor.data() + i), *(source.data() + i)); - } - - EXPECT_EQ(source.data(), tensor.data()); -} - -class TensorArrayPackTester : public ::testing::Test { - protected: - virtual void SetUp() override { - lod.push_back(std::vector{0, 2, 9, 13}); - - source.set_lod(lod); - source.Resize(make_ddim({13, 128})); - source.mutable_data(platform::CPUPlace()); - - // content of each setence: 0 1 2 3 4 - const auto& level = lod.front(); - for (size_t i = 0; i < level.size() - 1; i++) { - size_t begin = level[i]; - size_t end = level[i + 1]; - for (size_t j = begin; j < end; j++) { - auto record = source.Slice(j, j + 1); - for (int dim = 0; dim < 128; dim++) { - record.mutable_data(platform::CPUPlace())[dim] = j - begin; - } - } - } - - // unpack - meta = ta.Unpack(source, 0, true); - } - - LoD lod; - TensorArray ta; - LoDTensor source; - std::vector meta; -}; - -TEST_F(TensorArrayPackTester, Unpack) { - ASSERT_EQ(ta.size(), 7UL); - - const auto& t0 = ta.Read(0); - const auto& t1 = ta.Read(1); - - ASSERT_EQ(t0.data()[0], int(0)); - ASSERT_EQ(t1.data()[0], int(1)); -} - -TEST_F(TensorArrayPackTester, Pack) { - LoDTensor packed = ta.Pack(0, meta, lod); -} - -TEST_F(TensorArrayTester, size) { - ASSERT_EQ(ta.size(), static_cast(batch_size)); -} - -TEST(TensorArray, LodPack) { - // three time steps, each step stores a LoDTensors - // - [0] [1] - // - [2 3], [4 5] - // - [6 7] [] [8], [9, 10] - // try to get a LoDTensor with content: - // - [0 2 6] - // - [0 2 7] - // - [0 3] - // - [1 4 8] - // - [1 5 9] - // - [1 5 10] - std::array tensors; - tensors[0].Resize(make_ddim({2, 1})); - tensors[1].Resize(make_ddim({4, 1})); - tensors[2].Resize(make_ddim({5, 1})); - int index = 0; - for (auto& t : tensors) { - t.mutable_data(platform::CPUPlace()); - for (int i = 0; i < t.dims()[0]; i++) { - t.data()[i] = index; - index++; - } - } - - std::array lods; - std::vector> levels{ - {0, 1, 2}, {0, 2, 4}, {0, 2, 2, 3, 5}}; - for (int i = 0; i < 3; i++) { - lods[i].emplace_back(levels[i].begin(), levels[i].end()); - } - - TensorArray ta; - for (int i = 0; i < 3; i++) { - tensors[i].set_lod(lods[i]); - ta.Write(i, tensors[i]); - } - - auto merged = ta.LodPack(0); - - std::vector target_tensor_data{{0, 2, 6, // 0 - 0, 2, 7, // 1 - 0, 3, // 2 - 1, 4, 8, // 3 - 1, 5, 9, // 5 - 1, 5, 10}}; - EXPECT_EQ(merged.dims()[0], (int)target_tensor_data.size()); - for (size_t i = 0; i < target_tensor_data.size(); i++) { - EXPECT_EQ(target_tensor_data[i], merged.data()[i]); - } -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 7e88e03961..aba1f9f093 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -150,84 +150,6 @@ inline Tensor& Tensor::ShareDataWith(const Tensor& src) { return *this; } -inline void Tensor::CopyFrom(const Tensor& src, - const platform::Place& dst_place, - const platform::DeviceContext& ctx) { - src.check_memory_size(); - Resize(src.dims()); - - auto src_place = src.holder_->place(); - auto src_ptr = src.data(); - - auto dst_ptr = mutable_data(dst_place, src.type()); - - auto size = src.numel() * SizeOfType(src.type()); - - if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { - memory::Copy(boost::get(dst_place), dst_ptr, - boost::get(src_place), src_ptr, size); - } -#ifdef PADDLE_WITH_CUDA - else if (platform::is_gpu_place(src_place) && - platform::is_cpu_place(dst_place)) { - auto src_gpu_place = boost::get(src_place); - auto dst_cpu_place = boost::get(dst_place); - auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); - auto ctx_gpu_place = boost::get(ctx_place); - PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); - memory::Copy( - dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, - reinterpret_cast(ctx).stream()); - } else if (platform::is_cpu_place(src_place) && - platform::is_gpu_place(dst_place)) { - auto src_cpu_place = boost::get(src_place); - auto dst_gpu_place = boost::get(dst_place); - auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); - auto ctx_gpu_place = boost::get(ctx_place); - PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); - memory::Copy( - dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, - reinterpret_cast(ctx).stream()); - } else if (platform::is_gpu_place(src_place) && - platform::is_gpu_place(dst_place)) { - auto src_gpu_place = boost::get(src_place); - auto dst_gpu_place = boost::get(dst_place); - auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); - auto ctx_gpu_place = boost::get(ctx_place); - PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); - memory::Copy( - dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, - reinterpret_cast(ctx).stream()); - } -#endif -} - -template -inline void Tensor::CopyFromVector(const std::vector& src, - const platform::DeviceContext& ctx) { - auto dst_place = ctx.GetPlace(); - auto src_ptr = static_cast(src.data()); - platform::CPUPlace src_place; - auto dst_ptr = static_cast(mutable_data(dst_place)); - auto size = src.size() * sizeof(T); - - if (platform::is_cpu_place(dst_place)) { - memory::Copy(boost::get(dst_place), dst_ptr, src_place, - src_ptr, size); - } -#ifdef PADDLE_WITH_CUDA - else if (platform::is_gpu_place(dst_place)) { - memory::Copy( - boost::get(dst_place), dst_ptr, src_place, src_ptr, - size, - reinterpret_cast(ctx).stream()); - } -#endif -} - inline Tensor Tensor::Slice(int begin_idx, int end_idx) const { check_memory_size(); PADDLE_ENFORCE_GE(begin_idx, 0, diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 1bb0fb71b0..ceca64365a 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -188,178 +188,6 @@ TEST(Tensor, Slice) { #endif } -TEST(Tensor, CopyFrom) { - using namespace paddle::framework; - using namespace paddle::platform; - { - Tensor src_tensor; - Tensor dst_tensor; - CPUDeviceContext cpu_ctx((CPUPlace())); - - int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); - - int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - memcpy(src_ptr, arr, 9 * sizeof(int)); - - auto cpu_place = new paddle::platform::CPUPlace(); - dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx); - - const int* dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], dst_ptr[i]); - } - - Tensor slice_tensor = src_tensor.Slice(1, 2); - dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx); - const int* slice_ptr = slice_tensor.data(); - dst_ptr = dst_tensor.data(); - ASSERT_NE(dst_ptr, slice_ptr); - for (size_t i = 0; i < 3; ++i) { - EXPECT_EQ(dst_ptr[i], slice_ptr[i]); - } - } -#ifdef PADDLE_WITH_CUDA - { - Tensor src_tensor; - Tensor gpu_tensor; - Tensor dst_tensor; - - int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); - - int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - memcpy(src_ptr, arr, 9 * sizeof(int)); - - // CPU Tensor to GPU Tensor - auto gpu_place = new paddle::platform::GPUPlace(0); - CUDADeviceContext gpu_ctx(*gpu_place); - gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx); - - // GPU Tensor to CPU Tensor - auto cpu_place = new paddle::platform::CPUPlace(); - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); - - // Sync before Compare Tensors - gpu_ctx.Wait(); - const int* dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], dst_ptr[i]); - } - - Tensor slice_tensor = src_tensor.Slice(1, 2); - - // CPU Slice Tensor to GPU Tensor - gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx); - - // GPU Tensor to CPU Tensor - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); - - // Sync before Compare Slice Tensors - gpu_ctx.Wait(); - const int* slice_ptr = slice_tensor.data(); - dst_ptr = dst_tensor.data(); - ASSERT_NE(dst_ptr, slice_ptr); - for (size_t i = 0; i < 3; ++i) { - EXPECT_EQ(dst_ptr[i], slice_ptr[i]); - } - } -#endif -} - -TEST(Tensor, CopyFromVector) { - using namespace paddle::framework; - using namespace paddle::platform; - { - std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - Tensor cpu_tensor; - - // Copy to CPU Tensor - cpu_tensor.Resize(make_ddim({3, 3})); - auto cpu_place = new paddle::platform::CPUPlace(); - CPUDeviceContext cpu_ctx(*cpu_place); - cpu_tensor.CopyFromVector(src_vec, cpu_ctx); - - // Compare Tensors - const int* cpu_ptr = cpu_tensor.data(); - const int* src_ptr = src_vec.data(); - ASSERT_NE(src_ptr, cpu_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], cpu_ptr[i]); - } - - src_vec.erase(src_vec.begin(), src_vec.begin() + 5); - cpu_tensor.Resize(make_ddim({2, 2})); - cpu_tensor.CopyFromVector(src_vec, cpu_ctx); - cpu_ptr = cpu_tensor.data(); - src_ptr = src_vec.data(); - ASSERT_NE(src_ptr, cpu_ptr); - for (size_t i = 0; i < 5; ++i) { - EXPECT_EQ(src_ptr[i], cpu_ptr[i]); - } - - delete cpu_place; - } - -#ifdef PADDLE_WITH_CUDA - { - std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - Tensor cpu_tensor; - Tensor gpu_tensor; - Tensor dst_tensor; - - // Copy to CPU Tensor - cpu_tensor.Resize(make_ddim({3, 3})); - auto cpu_place = new paddle::platform::CPUPlace(); - CPUDeviceContext cpu_ctx(*cpu_place); - cpu_tensor.CopyFromVector(src_vec, cpu_ctx); - - // Copy to GPUTensor - gpu_tensor.Resize(make_ddim({3, 3})); - auto gpu_place = new paddle::platform::GPUPlace(); - CUDADeviceContext gpu_ctx(*gpu_place); - gpu_tensor.CopyFromVector(src_vec, gpu_ctx); - // Copy from GPU to CPU tensor for comparison - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); - - // Sync before Compare Tensors - gpu_ctx.Wait(); - const int* src_ptr = src_vec.data(); - const int* cpu_ptr = cpu_tensor.data(); - const int* dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, cpu_ptr); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], cpu_ptr[i]); - EXPECT_EQ(src_ptr[i], dst_ptr[i]); - } - - src_vec.erase(src_vec.begin(), src_vec.begin() + 5); - - cpu_tensor.Resize(make_ddim({2, 2})); - cpu_tensor.CopyFromVector(src_vec, cpu_ctx); - gpu_tensor.Resize(make_ddim({2, 2})); - gpu_tensor.CopyFromVector(src_vec, gpu_ctx); - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); - - // Sync before Compare Tensors - gpu_ctx.Wait(); - src_ptr = src_vec.data(); - cpu_ptr = cpu_tensor.data(); - dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, cpu_ptr); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 5; ++i) { - EXPECT_EQ(src_ptr[i], cpu_ptr[i]); - EXPECT_EQ(src_ptr[i], dst_ptr[i]); - } - - delete cpu_place; - delete gpu_place; - } -#endif -} - TEST(Tensor, ReshapeToMatrix) { using namespace paddle::framework; using namespace paddle::platform; diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h new file mode 100644 index 0000000000..8ee2e15a59 --- /dev/null +++ b/paddle/framework/tensor_util.h @@ -0,0 +1,153 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include "paddle/framework/tensor.h" + +namespace paddle { +namespace framework { + +/** + * @brief Copy the content of external tensor to a new place. + * + * @param[in] src The external tensor. + * @param[in] dst_place The dst place. + * @param[in] ctx The device context contains device resources. + * + * @note CopyFrom supports CPU <-> GPU, GPU <-> GPU. + */ + +inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx, Tensor* dst) { + src.check_memory_size(); + + dst->Resize(src.dims()); + auto src_place = src.place(); + auto src_ptr = src.data(); + + auto dst_ptr = dst->mutable_data(dst_place, src.type()); + + auto size = src.numel() * SizeOfType(src.type()); + + if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_cpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + memory::Copy( + dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_cpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); + memory::Copy( + dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } else if (platform::is_gpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + memory::Copy( + dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +/** + * @brief Copy the content of an external vector to a tensor. + * + * @param[in] src The external tensor. + * @param[in] ctx The device context contains device resources. + * + * * @note CopyFromVector assumes that the tensor has been resized + * before invoking. + */ +template +inline void CopyFromVector(const std::vector& src, + const platform::DeviceContext& ctx, Tensor* dst) { + auto dst_place = ctx.GetPlace(); + auto src_ptr = static_cast(src.data()); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(T); + + if (platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, src_place, + src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(dst_place)) { // NOLINT + memory::Copy( + boost::get(dst_place), dst_ptr, src_place, src_ptr, + size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +/** + * @brief Copy the content of a tensor to a vector + * + * @param[in] src The external tensor. + * @param[in] ctx The device context contains device resources. + * + * * @note CopyFromVector assumes that the tensor has been resized + * before invoking. + */ +template +inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx, + std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(T); + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(dst->data()); + + if (platform::is_cpu_place(src.place())) { + memory::Copy(dst_place, dst_ptr, boost::get(src.place()), + src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, boost::get(src.place()), src_ptr, + size, + reinterpret_cast(ctx).stream()); + } +#endif + +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc new file mode 100644 index 0000000000..03a70de182 --- /dev/null +++ b/paddle/framework/tensor_util_test.cc @@ -0,0 +1,228 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "paddle/framework/tensor_util.h" +#include +#include + +namespace paddle { +namespace framework { +TEST(CopyFrom, Tensor) { + Tensor src_tensor; + Tensor dst_tensor; + platform::CPUDeviceContext cpu_ctx((platform::CPUPlace())); + + int* src_ptr = + src_tensor.mutable_data(make_ddim({3, 3}), platform::CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + auto cpu_place = new platform::CPUPlace(); + CopyFrom(src_tensor, *cpu_place, cpu_ctx, &dst_tensor); + + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + CopyFrom(slice_tensor, *cpu_place, cpu_ctx, &dst_tensor); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } +#ifdef PADDLE_WITH_CUDA + { + Tensor src_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + int* src_ptr = + src_tensor.mutable_data(make_ddim({3, 3}), platform::CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + // CPU Tensor to GPU Tensor + auto gpu_place = new platform::GPUPlace(0); + platform::CUDADeviceContext gpu_ctx(*gpu_place); + CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + // GPU Tensor to CPU Tensor + auto cpu_place = new platform::CPUPlace(); + CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + + // CPU Slice Tensor to GPU Tensor + CopyFrom(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + // GPU Tensor to CPU Tensor + CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Slice Tensors + gpu_ctx.Wait(); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + } +#endif +} + +TEST(CopyFromVector, Tensor) { + using namespace paddle::framework; + using namespace paddle::platform; + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + Tensor cpu_tensor; + + // Copy to CPU Tensor + cpu_tensor.Resize(make_ddim({3, 3})); + auto cpu_place = new paddle::platform::CPUPlace(); + CPUDeviceContext cpu_ctx(*cpu_place); + CopyFromVector(src_vec, cpu_ctx, &cpu_tensor); + + // Compare Tensors + const int* cpu_ptr = cpu_tensor.data(); + const int* src_ptr = src_vec.data(); + ASSERT_NE(src_ptr, cpu_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + } + + src_vec.erase(src_vec.begin(), src_vec.begin() + 5); + cpu_tensor.Resize(make_ddim({2, 2})); + CopyFromVector(src_vec, cpu_ctx, &cpu_tensor); + cpu_ptr = cpu_tensor.data(); + src_ptr = src_vec.data(); + ASSERT_NE(src_ptr, cpu_ptr); + for (size_t i = 0; i < 5; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + } + + delete cpu_place; + } + +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + Tensor cpu_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + // Copy to CPU Tensor + cpu_tensor.Resize(make_ddim({3, 3})); + auto cpu_place = new paddle::platform::CPUPlace(); + CPUDeviceContext cpu_ctx(*cpu_place); + CopyFromVector(src_vec, cpu_ctx, &cpu_tensor); + + // Copy to GPUTensor + gpu_tensor.Resize(make_ddim({3, 3})); + auto gpu_place = new paddle::platform::GPUPlace(); + CUDADeviceContext gpu_ctx(*gpu_place); + CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); + // Copy from GPU to CPU tensor for comparison + CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + const int* src_ptr = src_vec.data(); + const int* cpu_ptr = cpu_tensor.data(); + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, cpu_ptr); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + src_vec.erase(src_vec.begin(), src_vec.begin() + 5); + + cpu_tensor.Resize(make_ddim({2, 2})); + CopyFromVector(src_vec, cpu_ctx, &cpu_tensor); + gpu_tensor.Resize(make_ddim({2, 2})); + CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); + CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + src_ptr = src_vec.data(); + cpu_ptr = cpu_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, cpu_ptr); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 5; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + delete cpu_place; + delete gpu_place; + } +#endif +} + +TEST(CopyToVector, Tensor) { + using namespace paddle::framework; + using namespace paddle::platform; + { + Tensor src; + int* src_ptr = src.mutable_data({3, 3}, CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = i; + } + + CPUPlace place; + CPUDeviceContext cpu_ctx(place); + std::vector dst; + CopyToVector(src, cpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); + } + } +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + Tensor gpu_tensor; + GPUPlace place; + CUDADeviceContext gpu_ctx(place); + CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); + + std::vector dst; + CopyToVector(gpu_tensor, gpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 059a6bba84..7ab09b6c65 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -178,7 +178,6 @@ set(DEPS_OPS cond_op cross_entropy_op recurrent_op - dynamic_recurrent_op softmax_with_cross_entropy_op softmax_op sequence_softmax_op @@ -225,13 +224,6 @@ op_library(sequence_pool_op DEPS sequence_pooling) op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(conv_transpose_op DEPS vol2col) op_library(gru_op DEPS sequence2batch gru_compute) -if(WITH_TESTING) - op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc - DEPS net_op tensor_array gtest) -else() - op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc - DEPS net_op tensor_array) -endif() op_library(recurrent_op SRCS recurrent_op.cc DEPS executor) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) @@ -246,9 +238,6 @@ cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) -cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc - rnn/recurrent_op_utils.cc - DEPS dynamic_recurrent_op) if(WITH_GPU) cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h index 233a81198e..1f2b4fdb4b 100644 --- a/paddle/operators/array_operator.h +++ b/paddle/operators/array_operator.h @@ -36,7 +36,7 @@ class ArrayOp : public framework::OperatorBase { if (platform::is_gpu_place(i_tensor.place())) { // FIXME: Avoid copy from GPU to CPU framework::Tensor t; - t.CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx); + framework::CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx, &t); dev_ctx.Wait(); offset = static_cast(*t.data()); } else { diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc index c0903bb4e5..faeba7f3ed 100644 --- a/paddle/operators/array_to_lod_tensor_op.cc +++ b/paddle/operators/array_to_lod_tensor_op.cc @@ -102,8 +102,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { if (len == 0) { continue; } - out->Slice(out_offset, out_offset + len) - .CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, dev_ctx); + auto slice = out->Slice(out_offset, out_offset + len); + framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, + dev_ctx, &slice); out_offset += len; } } diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc index 609e915b93..0a37f18729 100644 --- a/paddle/operators/assign_op.cc +++ b/paddle/operators/assign_op.cc @@ -43,7 +43,8 @@ class AssignFunctor { out_rows.set_rows(rows.rows()); out_rows.set_height(rows.height()); auto &t = rows.value(); - out_rows.mutable_value()->CopyFrom(t, t.place(), dev_ctx_); + auto *m = out_rows.mutable_value(); + framework::CopyFrom(t, t.place(), dev_ctx_, m); } template @@ -55,7 +56,7 @@ class AssignFunctor { void copy_tensor(const framework::LoDTensor &lod_tensor, framework::LoDTensor *out) const { auto &out_tensor = *out; - out_tensor.CopyFrom(lod_tensor, lod_tensor.place(), dev_ctx_); + CopyFrom(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor); out_tensor.set_lod(lod_tensor.lod()); } diff --git a/paddle/operators/beam_search_decode_op.h b/paddle/operators/beam_search_decode_op.h index 0f007ec22f..3b1c6cd7a1 100644 --- a/paddle/operators/beam_search_decode_op.h +++ b/paddle/operators/beam_search_decode_op.h @@ -232,12 +232,12 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( id_tensor->set_lod(lod); id_tensor->Resize({static_cast(id_data.size())}); id_tensor->mutable_data(paddle::platform::CPUPlace()); - id_tensor->CopyFromVector(id_data, cpu_ctx); + framework::CopyFromVector(id_data, cpu_ctx, id_tensor); score_tensor->set_lod(lod); score_tensor->Resize({static_cast(score_data.size())}); score_tensor->mutable_data(paddle::platform::CPUPlace()); - score_tensor->CopyFromVector(score_data, cpu_ctx); + framework::CopyFromVector(score_data, cpu_ctx, score_tensor); } template diff --git a/paddle/operators/dynamic_recurrent_op.cc b/paddle/operators/dynamic_recurrent_op.cc deleted file mode 100644 index d48cc4e8df..0000000000 --- a/paddle/operators/dynamic_recurrent_op.cc +++ /dev/null @@ -1,418 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve . - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/operators/dynamic_recurrent_op.h" - -#include "paddle/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using framework::Scope; -using framework::TensorArray; -using framework::LoDTensor; -using framework::Variable; -using framework::OperatorBase; -using framework::DySeqMetaBatch; - -namespace detail { - -inline void CreateVariables(Scope& scope, - const std::vector& var_names) { - for (const auto& name : var_names) { - scope.Var(name); - } -} - -/* - * The inputs with sequence should be reordered when they are split, so the - * boot_states should be reordered in the same order. - * - * NOTE This may require that the `pre_state` of the first time step should just - * copy the `boot_state` rather than reference it, for that the content should - * be reordered, but the RNN op should not change the `boot_state` as an input - * variable's content. - */ -inline void ReorderInitialState(const DySeqMetaBatch& metas, - const LoDTensor& boot_state, LoDTensor* tensor, - const platform::Place& dst_place) { - for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) { - auto slice = tensor->Slice(seq_id, seq_id + 1); - auto boot_slice = - boot_state.Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1); - // TODO(superjom) pass in device context as an argument - slice.CopyFrom(boot_slice, dst_place, platform::CPUDeviceContext()); - } -} - -inline void RestoreInitialState(const DySeqMetaBatch& metas, - const LoDTensor& tensor, LoDTensor* boot_state, - const platform::Place& dst_place) { - for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) { - auto slice = tensor.Slice(seq_id, seq_id + 1); - auto boot_slice = - boot_state->Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1); - boot_slice.CopyFrom(slice, dst_place, platform::CPUDeviceContext()); - } -} - -} // namespace detail - -// Implementation for forward propagation. -template <> -void RNNAlgorithm::Run( - const framework::Scope& scope, const framework::OperatorBase& op, - const platform::DeviceContext& dev_ctx) { - SetComputeMode(ComputeMode::kForward); - cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_); - SplitInputs(); - CreateScopes(); - WriteStepInputs(); - InitStates(); - WriteStepOutputs(); - RunSteps(); - ConcatOutputs(); -} - -// Implementation for backward propagation. -template <> -void RNNAlgorithm::Run( - const framework::Scope& scope, const framework::OperatorBase& op, - const platform::DeviceContext& dev_ctx) { - SetComputeMode(ComputeMode::kBackward); - cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_); - SplitInputs(); - WriteStepInputs(); - InitStates(); - WriteStepOutputs(); - RunSteps(); - // copy boot-states' gradients back. - for (const auto& state : arg_.states) { - ExportInitialStateGradient(state); - } - - ConcatOutputs(); -} - -void RNNAlgorithm::SplitInputs() { - // TODO(superjom) make level a config - // TODO(superjom) check all the inputs has the same LoD - int level = 0; - for (const auto& item : cache_.inputs) { - const auto& var = item.second; - const auto& tensor = var->Get(); - TensorArray& ta = step_inputs_[item.first]; - - dy_seq_metas_[item.first] = - ta.Unpack(tensor, level, true /*length_descend*/); - - if (cache_.num_steps) { - PADDLE_ENFORCE_EQ(ta.size(), cache_.num_steps, - "inputs should have the same steps"); - } else { - cache_.num_steps = ta.size(); - } - } -} - -void RNNAlgorithm::WriteStepInputs() { - for (const auto& item : cache_.inputs) { - auto ta_it = step_inputs_.find(item.first); - PADDLE_ENFORCE(ta_it != step_inputs_.end(), - "step_inputs_ not compatible with memory set"); - TensorArray& ta = ta_it->second; - for (size_t step = 0; step < ta.size(); step++) { - auto tensor = ta.Read(step); - auto& step_scope = cache_.GetScope(step); - Variable* var = step_scope.FindVar(item.first); - if (var == nullptr) { - var = step_scope.Var(item.first); - } - var->GetMutable()->ShareDataWith(tensor); - } - } -} - -void RNNAlgorithm::WriteStepOutputs() { - // initialize step outputs - for (const auto& item : cache_.outputs) { - step_outputs_.emplace(item.first, TensorArray()); - } - PADDLE_ENFORCE_GT(step_outputs_.size(), 0UL); -} - -void RNNAlgorithm::CreateScopes() { - PADDLE_ENFORCE_GT(cache_.num_steps, 0); - // resize scopes - size_t num_scopes_need_create = cache_.num_steps - cache_.scopes->size(); - for (size_t i = 0; i < num_scopes_need_create; i++) { - cache_.scopes->emplace_back(&cache_.scope->NewScope()); - } - - // init temporary inputs - PADDLE_ENFORCE_NOT_NULL(step_unit_, "stepnet should be set first"); - std::vector states; - std::vector ex_states; - std::vector step_unit_outputs; - std::transform(arg_.states.begin(), arg_.states.end(), - std::back_inserter(states), - [](const rnn::StateAttr& m) { return m.var; }); - std::transform(arg_.states.begin(), arg_.states.end(), - std::back_inserter(ex_states), - [](const rnn::StateAttr& m) { return m.pre_var; }); - for (const auto& item : step_unit_->Outputs()) { - for (const auto& var : item.second) { - step_unit_outputs.push_back(var); - } - } - - for (size_t step = 0; step < cache_.num_steps; step++) { - auto& scope = cache_.GetScope(step); - detail::CreateVariables(scope, arg_.inlinks); - detail::CreateVariables(scope, arg_.outlinks); - detail::CreateVariables(scope, states); - detail::CreateVariables(scope, ex_states); - detail::CreateVariables(scope, step_unit_outputs); - } -} - -void RNNAlgorithm::ConcatOutputs() { - // TODO(superjom) transform this to a config - int level = 0; - for (size_t step = 0; step < cache_.num_steps; step++) { - auto& scope = cache_.GetScope(step); - for (auto& item : step_outputs_) { - auto* var = scope.FindVar(item.first); - PADDLE_ENFORCE_NOT_NULL(var); - auto* tensor = var->GetMutable(); - tensor->mutable_data(platform::CPUPlace()); - item.second.WriteShared(step, *tensor); - } - } - // the inputs' lods should be the same, so randomly get one lod. - const auto& some_lod = - cache_.scope->FindVar(arg_.inlinks.front())->Get().lod(); - const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; - for (auto& item : step_outputs_) { - auto tensor = item.second.Pack(level, some_meta, some_lod); - auto* output = cache_.outputs[item.first]->GetMutable(); - const_cast(output)->ShareDataWith(tensor); - } -} - -void RNNAlgorithm::RunSteps() { - if (IsBackward()) { - // call stepnet in all the time steps reversely - for (int step = cache_.num_steps - 1; step >= 0; step--) { - auto& step_scope = cache_.GetScope(step); - step_unit_->Run(step_scope, *cache_.dev_ctx); - } - } else { - for (size_t step = 0; step < cache_.num_steps; step++) { - auto& step_scope = cache_.GetScope(step); - step_unit_->Run(step_scope, *cache_.dev_ctx); - } - } -} - -void RNNAlgorithm::InitStates() { - for (size_t step = 0; step < cache_.num_steps; step++) { - for (const auto& state : arg_.states) { - CreateState(state, step); - LinkState(state, step); - } - } -} - -void RNNAlgorithm::CreateState(const rnn::StateAttr& state_attr, size_t step) { - auto& scope = cache_.GetScope(step); - auto& state = *cache_.GetTensor(scope, state_attr.var); - auto& boot_state = *cache_.GetTensor(*cache_.scope, state_attr.boot_var); - - size_t num_instances = - step_inputs_[arg_.inlinks.front()].Read(step).dims()[0]; - auto dims = boot_state.dims(); - dims[0] = num_instances; - - state.Resize(dims); - state.mutable_data(platform::CPUPlace()); - states_[state_attr.var].WriteShared(step, state); -} - -void RNNAlgorithm::LinkState(const rnn::StateAttr& state, size_t step) { - auto& scope = cache_.GetScope(step); - auto& state_pre = *cache_.GetTensor(scope, state.pre_var); - - // process the first state's boot-state(the 0-step in forward mode or the - // last step in backward mode) - // Only forward mode need to link the boot-state to the `pre-state` in first - // time step. In backward mode, need to copy the gradient of `pre-state` in - // first time step to the gradient of `boot-state`. - if (step == 0 && IsForward()) { - LinkInitialState(state); - } else { - size_t num_instances = - step_inputs_[arg_.inlinks.front()].Read(step).dims()[0]; - auto* pre_state = cache_.GetTensor(cache_.GetScope(step - 1), state.var); - // shink and share from previous state - auto shrinked_pre_state = pre_state->Slice(0, num_instances); - state_pre.ShareDataWith(shrinked_pre_state); - } -} - -void RNNAlgorithm::LinkInitialState(const rnn::StateAttr& state) { - // all the step_inputs' metas should be the same, just randomly select one - // and get the dyseq meta. - const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; - auto& scope = cache_.GetScope(0); - auto& state_pre = *cache_.GetTensor(scope, state.pre_var); - auto* pre_state = cache_.GetTensor(*cache_.scope, state.boot_var); - pre_state->mutable_data(platform::CPUPlace()); - // allocate state - state_pre.Resize(pre_state->dims()); - state_pre.mutable_data(platform::CPUPlace()); - detail::ReorderInitialState(some_meta, *pre_state, &state_pre, - pre_state->place()); -} - -void RNNAlgorithm::ExportInitialStateGradient(const rnn::StateAttr& state) { - // all the step_inputs' metas should be the same, just randomly select one - // and get the dyseq meta. - const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; - auto& scope = cache_.GetScope(0); - - auto& state_pre = *cache_.GetTensor(scope, state.pre_var); - auto& pre_state = *cache_.GetTensor(*cache_.scope, state.boot_var); - pre_state.Resize(state_pre.dims()); - detail::RestoreInitialState(some_meta, state_pre, &pre_state, - pre_state.place()); -} - -void RNNAlgorithm::ArgCache::Init(const rnn::ArgumentName& name, - const paddle::framework::OperatorBase& op, - const paddle::framework::Scope& scope, - platform::DeviceContext const* dev_ctx, - rnn::Argument* arg) { - this->scope = &scope; - InitArgument(name, op, arg); - CacheScopes(scope, *arg); - CacheInlinks(scope, arg->inlinks); - CacheOutlinks(scope, arg->outlinks); - this->dev_ctx = dev_ctx; -} - -void RNNAlgorithm::ArgCache::InitArgument(const rnn::ArgumentName& name, - const OperatorBase& op, - rnn::Argument* arg) { - rnn::InitArgument(name, arg, op, false /*is_grad*/); -} - -void RNNAlgorithm::ArgCache::CacheScopes(const Scope& scope, - const rnn::Argument& arg) { - auto scopes_var = scope.FindVar(arg.step_scopes); - PADDLE_ENFORCE(scopes_var != nullptr, - "the step_scopes output argument [%s] should be created first " - "by framework.", - arg.step_scopes); - this->scopes = scopes_var->GetMutable>(); -} - -void RNNAlgorithm::ArgCache::CacheInlinks( - const Scope& scope, const std::vector& names) { - for (auto name : names) { - auto* var = GetVariable(scope, name); - inputs[name] = var; - } -} - -void RNNAlgorithm::ArgCache::CacheOutlinks( - const Scope& scope, const std::vector& names) { - for (auto name : names) { - auto* var = GetVariable(scope, name); - outputs[name] = var; - } -} - -Variable* RNNAlgorithm::ArgCache::GetVariable(const Scope& scope, - const std::string& name) { - auto* var = scope.FindVar(name); - PADDLE_ENFORCE_NOT_NULL(var, "variable [%s] not exist in scope", name); - return var; -} - -LoDTensor* RNNAlgorithm::ArgCache::GetTensor(const framework::Scope& scope, - const std::string& name) { - auto* var = GetVariable(scope, name); - return var->GetMutable(); -} - -const std::array RNNAlgorithm::kArgNames{ - {rnn::ArgumentName{"step_unit", "step_scopes", "inputs", "outputs", - "states", "ex_states", "initial_states"}, - rnn::ArgumentName{"step_unit", "step_scopes@GRAD", "outputs@GRAD", - "inputs@GRAD", "states", "ex_states", - "initial_states@GRAD"}}}; - -void DynamicRecurrentOp::Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const { - rnn.Run( - scope, *dynamic_cast(this), dev_ctx); -} - -void DynamicRecurrentGradientOp::Run( - const Scope& scope, const platform::DeviceContext& dev_ctx) const { - rnn.Run( - scope, *dynamic_cast(this), dev_ctx); -} - -class DynamicRecurrentOpProtoAndCheckerMaker - : public framework::OpProtoAndCheckerMaker { - public: - DynamicRecurrentOpProtoAndCheckerMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - const auto& name = - RNNAlgorithm::kArgNames[RNNAlgorithm::ComputeMode::kForward]; - // inputs and outputs stored in proto - AddInput(name.inlinks, - "The inputs that need to be segmented for each step.") - .AsDuplicable(); - AddInput(name.initial_states, "Variables to initialize the states.") - .AsDuplicable(); - - AddOutput(name.outlinks, - "The outputs that need to be concatenated for all steps.") - .AsDuplicable(); - AddOutput(name.step_scopes, "step scopes"); - - // Attributes stored in AttributeMap - AddAttr>(name.ex_states, "names of ex_states"); - AddAttr>(name.states, "names of states"); - - AddComment(R"DOC( -Dynamic Recurrent Operator. - -This is a RNN operator for varience-length sequences. - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP(dynamic_recurrent, paddle::operators::DynamicRecurrentOp, - paddle::operators::DynamicRecurrentOpProtoAndCheckerMaker, - dynamic_recurrent_grad, - paddle::operators::DynamicRecurrentGradientOp); diff --git a/paddle/operators/dynamic_recurrent_op.h b/paddle/operators/dynamic_recurrent_op.h deleted file mode 100644 index 5b0548c3a4..0000000000 --- a/paddle/operators/dynamic_recurrent_op.h +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#ifdef PADDLE_WITH_TESTING -#include "gtest/gtest.h" -#endif - -#include "paddle/framework/lod_tensor.h" -#include "paddle/framework/operator.h" -#include "paddle/framework/tensor_array.h" -#include "paddle/framework/variable.h" -#include "paddle/operators/rnn/recurrent_op_utils.h" - -namespace paddle { -namespace operators { - -class RNNAlgorithm { - public: - enum ComputeMode { kForward = 0, kBackward = 1 }; - static const std::array kArgNames; - using value_type = float; - - /* - * Different `Run` method for forward and backward, `_` is just for template - * specifialization. - */ - template - void Run(const framework::Scope& scope, const framework::OperatorBase& op, - const platform::DeviceContext& dev_ctx); - /* - * Split the inputs(LoDTensors) to segments for each time step. - */ - void SplitInputs(); - - /* - * Create step-scopes to store temporary outputs in each time steps. - */ - void CreateScopes(); - - /* - * Link TensorArray steps to the corresponding variables located in - * step-scopes. - */ - void WriteStepInputs(); - - /* - * Write output of each step to the corresponding TensorArray. - */ - void WriteStepOutputs(); - - /* - * Initialize the states, each state will have a corresponding pre-state, - * which share the memory with the state in the previous time state. The - * pre-state in the first time step will be initialized with an zero tensor or - * a tensor in parent scope if is provided. - */ - void InitStates(); - - /* - * Create state variables for each time step. - */ - void CreateState(const rnn::StateAttr& state, size_t step); - - /* - * Link pre-state variable in current scope to the state variable in the - * previous time step (scope) by reference. - */ - void LinkState(const rnn::StateAttr& state, size_t step); - - /* - * Link the pre-state of the first time step to the `boot-state` in parent's - * scope. - */ - void LinkInitialState(const rnn::StateAttr& state); - - /* - * Copy the gradient from `pre-state` in the first step-scope to the - * `boot-state` in parent's scope. - */ - void ExportInitialStateGradient(const rnn::StateAttr& state); - - /* - * Calculate time steps. - */ - void RunSteps(); - - /* - * Concatenate outputs in each time step and generate a LoDTensor. - */ - void ConcatOutputs(); - - void SetComputeMode(ComputeMode mode) { mode_ = mode; } - bool IsForward() const { return mode_ == ComputeMode::kForward; } - bool IsBackward() const { return mode_ == ComputeMode::kBackward; } - - /* - * set a step unit that is created according to a RecurrentOp's step unit. - */ - void SetStepUnit(std::unique_ptr step_unit) { - PADDLE_ENFORCE_NOT_NULL(step_unit); - step_unit_ = std::move(step_unit); - } - const framework::OperatorBase& GetStepUnit() const { return *step_unit_; } - - const framework::TensorArray& state(const std::string& name) const { - auto it = states_.find(name); - PADDLE_ENFORCE(it != states_.end()); - return it->second; - } - const framework::TensorArray& step_input(const std::string& name) const { - auto it = step_inputs_.find(name); - PADDLE_ENFORCE(it != step_inputs_.end()); - return it->second; - } - const framework::TensorArray& step_output(const std::string& name) const { - auto it = step_outputs_.find(name); - PADDLE_ENFORCE(it != step_outputs_.end()); - return it->second; - } - - protected: - struct ArgCache { - framework::Scope const* scope; - std::vector* scopes; - std::map inputs; - std::map outputs; - platform::DeviceContext const* dev_ctx; - - size_t num_steps{0}; - - void Init(const rnn::ArgumentName& name, const framework::OperatorBase& op, - const framework::Scope& scope, - platform::DeviceContext const* dev_ctx, rnn::Argument* arg); - - framework::Scope& GetScope(size_t index) { - PADDLE_ENFORCE_LT(index, num_steps); - return *scopes->at(index); - } - - framework::LoDTensor* GetTensor(const framework::Scope& scope, - const std::string& name); - - private: - void InitArgument(const rnn::ArgumentName& name, - const framework::OperatorBase& op, rnn::Argument* arg); - void CacheScopes(const framework::Scope& scope, const rnn::Argument& arg); - void CacheInlinks(const framework::Scope& scope, - const std::vector& names); - void CacheOutlinks(const framework::Scope& scope, - const std::vector& names); - framework::Variable* GetVariable(const framework::Scope& scope, - const std::string& name); - }; - - private: - std::unique_ptr step_unit_; - std::map states_; - std::map step_inputs_; - std::map step_outputs_; - std::map> dy_seq_metas_; - rnn::Argument arg_; - ArgCache cache_; - ComputeMode mode_{ComputeMode::kForward}; - -#ifdef PADDLE_WITH_TESTING - // test forward - friend class RNNAlgorithmTestHelper; - FRIEND_TEST(RNNAlgorithmTestHelper, SplitInputs); - FRIEND_TEST(RNNAlgorithmTestHelper, CreateCache); - FRIEND_TEST(RNNAlgorithmTestHelper, CreateScopes); - FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepInputs); - FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepOutputs); - FRIEND_TEST(RNNAlgorithmTestHelper, InitStates); - FRIEND_TEST(RNNAlgorithmTestHelper, ConcatOutputs); -// TODO(superjom) test backward -#endif -}; - -class DynamicRecurrentOp : public framework::OperatorBase { - public: - DynamicRecurrentOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - DynamicRecurrentOp(const DynamicRecurrentOp& o) - : framework::OperatorBase( - static_cast(o)) { - PADDLE_THROW("Not implemented"); - } - - void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override; - - mutable RNNAlgorithm rnn; -}; - -class DynamicRecurrentGradientOp : public framework::OperatorBase { - public: - DynamicRecurrentGradientOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - DynamicRecurrentGradientOp(const DynamicRecurrentGradientOp& o) - : framework::OperatorBase( - static_cast(o)) { - PADDLE_THROW("Not implemented"); - } - - void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override; - - mutable RNNAlgorithm rnn; -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/dynamic_recurrent_op_test.cc b/paddle/operators/dynamic_recurrent_op_test.cc deleted file mode 100644 index 8d840e259b..0000000000 --- a/paddle/operators/dynamic_recurrent_op_test.cc +++ /dev/null @@ -1,217 +0,0 @@ -#include "paddle/operators/dynamic_recurrent_op.h" - -#include - -#include "paddle/framework/ddim.h" -#include "paddle/framework/lod_tensor.h" -#include "paddle/framework/op_desc.h" -#include "paddle/framework/op_registry.h" -#include "paddle/operators/net_op.h" - -namespace paddle { -namespace operators { - -using framework::Scope; -using framework::TensorArray; -using framework::LoDTensor; -using framework::Variable; - -class TestOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - DEFINE_OP_CLONE_METHOD(TestOp); - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override {} -}; - -void OpDescNewVar(const std::string& param_name, - std::initializer_list arguments, - paddle::framework::OpDesc::Var* var) { - var->set_parameter(param_name); - for (auto& arg_name : arguments) { - var->add_arguments(arg_name); - } -} - -// create a LoD tensor in scope with specific dims -LoDTensor* CreateVar(Scope& scope, std::string name, framework::DDim dims, - const platform::Place& place) { - auto* var = scope.Var(name); - auto* tensor = var->GetMutable(); - tensor->Resize(dims); - tensor->mutable_data(place); - return tensor; -} - -class RNNAlgorithmTestHelper : public ::testing::Test { - protected: - const rnn::ArgumentName argname = RNNAlgorithm::kArgNames[0]; - - virtual void SetUp() override { - CreateGlobalVariables(); - - auto op_desc = CreateOpDesc(); - op = paddle::framework::OpRegistry::CreateOp(op_desc); - dop = &(dynamic_cast(op.get())->rnn); - InitCacheManually(); - InitStepNet(); - } - - framework::OpDesc CreateOpDesc() { - // create op - paddle::framework::OpDesc op_desc; - op_desc.set_type("dynamic_recurrent"); - - OpDescNewVar(argname.inlinks, {"in0"}, op_desc.add_inputs()); - OpDescNewVar(argname.initial_states, {"boot_mem"}, op_desc.add_inputs()); - OpDescNewVar(argname.step_scopes, {"step_scopes"}, op_desc.add_outputs()); - OpDescNewVar(argname.outlinks, {"out0"}, op_desc.add_outputs()); - - // set pre-states - auto pre_memories = op_desc.mutable_attrs()->Add(); - pre_memories->set_name(argname.ex_states); - pre_memories->set_type(paddle::framework::AttrType::STRINGS); - auto pre_memories_item = pre_memories->add_strings(); - *pre_memories_item = "mem@pre"; - - // set states - auto memories = op_desc.mutable_attrs()->Add(); - memories->set_name(argname.states); - memories->set_type(paddle::framework::AttrType::STRINGS); - auto memories_item = memories->add_strings(); - *memories_item = "mem"; - return op_desc; - } - - void CreateGlobalVariables() { - platform::CPUPlace place; - scope.Var("step_scopes"); - CreateVar(scope, "boot_mem", framework::make_ddim({10, 20}), place); - CreateVar(scope, "out0", framework::make_ddim({10, 20}), place); - auto* in0 = CreateVar(scope, "in0", framework::make_ddim({10, 8}), place); - // 10 instanes with 4 sentences, length is 4, 3, 2, 1 respectively. - framework::LoD in0_lod(1); - for (int x : std::vector{0, 4, 7, 9, 10}) { - in0_lod[0].push_back(x); - } - in0->set_lod(in0_lod); - in0->Resize(framework::make_ddim({10, 8})); - // set the content, each sentence content is seqid.batchid - // the seqid starts from 0 - int start = 0; - for (size_t seqid = 0; seqid < in0_lod.size() - 1; seqid++) { - for (size_t batchid = 0; - batchid < in0_lod[0][seqid + 1] - in0_lod[0][seqid]; batchid++) { - float v = seqid + batchid * 0.1; - - for (size_t dim = 0; dim < 8; dim++) { - in0->data()[start * 8 + dim] = v; - } - start++; - } - } - } - - void InitCacheManually() { - dop->cache_.Init(RNNAlgorithm::kArgNames[0], *op, scope, &device_context, - &dop->arg_); - } - - void InitStepNet() { - std::unique_ptr stepnet{new NetOp}; - dynamic_cast(stepnet.get()) - ->AppendOp(std::unique_ptr(new TestOp( - "test", {{"inputs", {"in0"}}, {"initial_states", {"boot_mem"}}}, - {{"outputs", {"out0"}}, {"step_scopes", {"step_scopes"}}}, {}))); - dop->SetStepUnit(std::move(stepnet)); - } - - protected: - RNNAlgorithm* dop; - std::unique_ptr op; - paddle::platform::CPUDeviceContext device_context; - paddle::framework::Scope scope; -}; - -TEST_F(RNNAlgorithmTestHelper, CreateCache) { - const rnn::Argument& arg = dop->arg_; - ASSERT_EQ(arg.inlinks.size(), 1UL); - ASSERT_EQ(arg.outlinks.size(), 1UL); -} - -TEST_F(RNNAlgorithmTestHelper, SplitInputs) { - dop->SplitInputs(); - auto& in0_ta = dop->step_inputs_["in0"]; - ASSERT_EQ(in0_ta.size(), 4UL); - - const auto& batch0 = in0_ta.Read(0); - const auto& batch1 = in0_ta.Read(1); - const auto& batch2 = in0_ta.Read(2); - const auto& batch3 = in0_ta.Read(3); - EXPECT_EQ(batch0.dims()[0], 4); - EXPECT_EQ(batch1.dims()[0], 3); - EXPECT_EQ(batch2.dims()[0], 2); - EXPECT_EQ(batch3.dims()[0], 1); -} - -TEST_F(RNNAlgorithmTestHelper, CreateScopes) { - dop->SplitInputs(); - dop->CreateScopes(); - ASSERT_EQ(dop->cache_.num_steps, 4UL); - ASSERT_EQ(dop->cache_.scopes->size(), 4UL); -} - -TEST_F(RNNAlgorithmTestHelper, WriteStepInputs) { - dop->SplitInputs(); - dop->CreateScopes(); - dop->WriteStepInputs(); - - for (size_t step = 0; step < dop->cache_.num_steps; step++) { - auto& scope = dop->cache_.GetScope(step); - for (auto name : std::vector({"in0"})) { - ASSERT_TRUE(scope.FindVar(name) != nullptr); - } - } -} - -TEST_F(RNNAlgorithmTestHelper, WriteStepOutputs) { - dop->SplitInputs(); - dop->CreateScopes(); - dop->WriteStepInputs(); - dop->WriteStepOutputs(); - - for (size_t step = 0; step < dop->cache_.num_steps; step++) { - auto& scope = dop->cache_.GetScope(step); - for (auto name : std::vector({"out0"})) { - ASSERT_TRUE(scope.FindVar(name)); - } - } -} - -TEST_F(RNNAlgorithmTestHelper, ConcatOutputs) { - // Let's leave this test to python unittest. -} - -TEST_F(RNNAlgorithmTestHelper, InitStates) { - dop->SetComputeMode(RNNAlgorithm::ComputeMode::kForward); - dop->SplitInputs(); - dop->CreateScopes(); - dop->WriteStepInputs(); - dop->WriteStepOutputs(); - dop->InitStates(); - - for (size_t step = 0; step < dop->cache_.num_steps; step++) { - auto& scope = dop->cache_.GetScope(step); - auto state = scope.FindVar("mem"); - ASSERT_TRUE(state != nullptr); - - auto* pre_state = scope.FindVar("mem@pre"); - ASSERT_TRUE(pre_state != nullptr); - - auto* boot_state = scope.FindVar("boot_mem"); - ASSERT_TRUE(boot_state != nullptr); - } -} - -} // operators -} // namespace paddle diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h index 8ae2c11a5d..4d7996ad1e 100644 --- a/paddle/operators/expand_op.h +++ b/paddle/operators/expand_op.h @@ -125,7 +125,8 @@ class ExpandGradKernel : public framework::OpKernel { auto* in0 = context.Input(framework::GradVarName("Out")); auto* out0 = context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); - out0->CopyFrom(*in0, context.GetPlace(), context.device_context()); + framework::CopyFrom(*in0, context.GetPlace(), context.device_context(), + out0); } else { switch (dims) { REP_EXPAND_GRAD_TEMPLATE(72) diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index 0dd84cbeaa..ee43c22fb1 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -47,7 +47,7 @@ class FeedOp : public framework::OperatorBase { auto &feed_list = feed_var->Get(); auto &feed_item = feed_list.at(static_cast(col)); auto *out_item = out_var->GetMutable(); - out_item->CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx); + framework::CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx, out_item); out_item->set_lod(feed_item.lod()); } }; diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index 8108ae69de..1ae07194c2 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -51,7 +51,7 @@ class FetchOp : public framework::OperatorBase { // FIXME(yuyang18): Should we assume the fetch operator always generate // CPU outputs? - dst_item.CopyFrom(src_item, platform::CPUPlace(), dev_ctx); + CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item); dev_ctx.Wait(); dst_item.set_lod(src_item.lod()); diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h index 050430d325..3398c0934e 100644 --- a/paddle/operators/gru_unit_op.h +++ b/paddle/operators/gru_unit_op.h @@ -28,6 +28,10 @@ template using EigenMatrix = framework::EigenMatrix; +template +using EigenVector = framework::EigenVector; + enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; template @@ -226,7 +230,7 @@ class GRUUnitGradKernel : public framework::OpKernel { // backward for bias if (bias_grad) { bias_grad->mutable_data(context.GetPlace()); - auto d_b = EigenMatrix::From(*bias_grad); + auto d_b = EigenVector::Flatten(*bias_grad); d_b.device(place) = d_g.sum(Eigen::array({{0}})); } } diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h index 872f659fed..014bbfa758 100644 --- a/paddle/operators/linear_chain_crf_op.h +++ b/paddle/operators/linear_chain_crf_op.h @@ -195,7 +195,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { auto copyLoDTensor = [](const platform::DeviceContext& ctx, const LoDTensor& src, LoDTensor* dst) { dst->mutable_data(src.dims(), platform::CPUPlace()); - dst->CopyFrom(src, platform::CPUPlace(), ctx); + framework::CopyFrom(src, platform::CPUPlace(), ctx, dst); }; copyLoDTensor(ctx, emission_weights_src, emission_weights_dst); @@ -203,8 +203,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { transition_weights_dst->mutable_data(transition_weights_src.dims(), platform::CPUPlace()); - transition_weights_dst->CopyFrom(transition_weights_src, - platform::CPUPlace(), ctx); + framework::CopyFrom(transition_weights_src, platform::CPUPlace(), ctx, + transition_weights_dst); } void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx, @@ -219,7 +219,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src, Tensor* dst) { dst->mutable_data(platform::GPUPlace()); - dst->CopyFrom(src, platform::GPUPlace(), ctx); + framework::CopyFrom(src, platform::GPUPlace(), ctx, dst); }; copyTensor(ctx, emission_exps_src, emission_exps_dst); copyTensor(ctx, transition_exps_src, transition_exps_dst); @@ -410,12 +410,12 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { // Copy the inputs from GPU memory to CPU memory when this operators runs on // GPU device. label_dst->mutable_data(label_src.dims(), platform::CPUPlace()); - label_dst->CopyFrom(label_src, platform::CPUPlace(), ctx); + framework::CopyFrom(label_src, platform::CPUPlace(), ctx, label_dst); auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src, Tensor* dst) { dst->mutable_data(src.dims(), platform::CPUPlace()); - dst->CopyFrom(src, platform::CPUPlace(), ctx); + framework::CopyFrom(src, platform::CPUPlace(), ctx, dst); }; copyTensor(ctx, emission_exps_src, emission_exps_dst); copyTensor(ctx, transition_exps_src, transition_exps_dst); @@ -434,7 +434,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { Tensor* dst) { if (src && dst) { dst->mutable_data(platform::GPUPlace()); - dst->CopyFrom(*src, platform::GPUPlace(), ctx); + framework::CopyFrom(*src, platform::GPUPlace(), ctx, dst); } }; copyTensor(ctx, emission_grad_src, emission_grad_dst); diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc index b71a33a6b1..b0838eed16 100644 --- a/paddle/operators/load_op.cc +++ b/paddle/operators/load_op.cc @@ -105,7 +105,7 @@ class LoadOp : public framework::OperatorBase { out_var->Clear(); tensor = out_var->GetMutable(); tensor->set_lod(cpu_tensor.lod()); - tensor->CopyFrom(cpu_tensor, place, dev_ctx); + CopyFrom(cpu_tensor, place, dev_ctx, tensor); } } }; diff --git a/paddle/operators/lod_reset_op.h b/paddle/operators/lod_reset_op.h index 2bb916ccee..cbcbf80adc 100644 --- a/paddle/operators/lod_reset_op.h +++ b/paddle/operators/lod_reset_op.h @@ -33,7 +33,8 @@ class LoDResetKernel : public framework::OpKernel { auto* lod = lod_t->data(); if (platform::is_gpu_place(ctx.GetPlace())) { framework::Tensor lod_cpu; - lod_cpu.CopyFrom(*lod_t, platform::CPUPlace(), ctx.device_context()); + framework::CopyFrom(*lod_t, platform::CPUPlace(), ctx.device_context(), + &lod_cpu); lod = lod_cpu.data(); } level0 = std::vector(lod, lod + lod_t->numel()); diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc index 58af35564d..010c79d4e1 100644 --- a/paddle/operators/lod_tensor_to_array_op.cc +++ b/paddle/operators/lod_tensor_to_array_op.cc @@ -81,11 +81,11 @@ class LoDTensorToArrayOp : public framework::OperatorBase { continue; } // out[i][offset: offset+len] = x[each_range.begin: each_range.end] - out[i] - .Slice(static_cast(offset), static_cast(offset + len)) - .CopyFrom(x.Slice(static_cast(each_range.begin), - static_cast(each_range.end)), - x.place(), dev_ctx); + auto slice = out[i].Slice(static_cast(offset), + static_cast(offset + len)); + framework::CopyFrom(x.Slice(static_cast(each_range.begin), + static_cast(each_range.end)), + x.place(), dev_ctx, &slice); offset += len; } } diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h index 72f4202bac..d853507188 100644 --- a/paddle/operators/math/context_project.h +++ b/paddle/operators/math/context_project.h @@ -149,7 +149,7 @@ class ContextProjectFunctor { Tensor out_t_sub = out_t.Slice(k * context_length, k * context_length + padding_size); Tensor w_sub = padding_data.Slice(k, k + padding_size); - out_t_sub.CopyFrom(w_sub, context.GetPlace(), context); + framework::CopyFrom(w_sub, context.GetPlace(), context, &out_t_sub); } } if (down_pad > 0) { // add down pad @@ -179,7 +179,7 @@ class ContextProjectFunctor { (down_pad_begin_row + t) * context_length); Tensor w_sub = padding_data.Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); - out_t_sub.CopyFrom(w_sub, context.GetPlace(), context); + framework::CopyFrom(w_sub, context.GetPlace(), context, &out_t_sub); } } out_t.Resize({sequence_height, context_length * sequence_width}); diff --git a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h index deb60051be..24fd9a06e9 100644 --- a/paddle/operators/math/im2col.h +++ b/paddle/operators/math/im2col.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" #include "paddle/platform/device_context.h" namespace paddle { diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index 10c28da72b..ae197a97ed 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -74,7 +74,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + CopyFrom(input_tmp, *place, *context, &input); } output_cfo.mutable_data( {1, filter_size, filter_size, output_height, output_width}, *place); @@ -99,7 +99,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output_cfo.data(); } else { - output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context); + CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp); out_cfo_ptr = output_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -110,7 +110,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_ocf_ptr = output_ocf.data(); } else { - output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context); + CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp); out_ocf_ptr = output_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -130,7 +130,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + CopyFrom(input_tmp, *place, *context, &input); } col2im(*context, output_cfo, dilation, stride, padding, &input); @@ -139,7 +139,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); + CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -151,7 +151,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + CopyFrom(input_tmp, *place, *context, &input); } col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); @@ -159,7 +159,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); + CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index ffb99f5380..5a42854f22 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -49,6 +49,7 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda, #include "paddle/framework/eigen.h" #include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" #include "paddle/platform/device_context.h" #include "paddle/platform/enforce.h" diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu index 780d17ffc6..d5d6f0c73b 100644 --- a/paddle/operators/math/math_function_test.cu +++ b/paddle/operators/math/math_function_test.cu @@ -16,15 +16,15 @@ TEST(math_function, notrans_mul_trans) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input1, *gpu_place, context); + paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); + paddle::framework::CopyFrom(input1, *gpu_place, context, &input2_gpu); out_gpu.mutable_data({2, 2}, *gpu_place); paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); - out.CopyFrom(out_gpu, *cpu_place, context); + paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); float* out_ptr = out.data(); context.Wait(); @@ -50,15 +50,15 @@ TEST(math_function, trans_mul_notrans) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input1, *gpu_place, context); + paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); + paddle::framework::CopyFrom(input1, *gpu_place, context, &input2_gpu); out_gpu.mutable_data({3, 3}, *gpu_place); paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); - out.CopyFrom(out_gpu, *cpu_place, context); + paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); float* out_ptr = out.data(); context.Wait(); @@ -99,9 +99,9 @@ TEST(math_function, gemm_notrans_cublas) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input2, *gpu_place, context); - input3_gpu.CopyFrom(input3, *gpu_place, context); + paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); + paddle::framework::CopyFrom(input2, *gpu_place, context, &input2_gpu); + paddle::framework::CopyFrom(input3, *gpu_place, context, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); @@ -109,7 +109,7 @@ TEST(math_function, gemm_notrans_cublas) { paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); - input3.CopyFrom(input3_gpu, *cpu_place, context); + paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -154,9 +154,9 @@ TEST(math_function, gemm_trans_cublas) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input2, *gpu_place, context); - input3_gpu.CopyFrom(input3, *gpu_place, context); + paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); + paddle::framework::CopyFrom(input2, *gpu_place, context, &input2_gpu); + paddle::framework::CopyFrom(input3, *gpu_place, context, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); @@ -164,7 +164,7 @@ TEST(math_function, gemm_trans_cublas) { paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); - input3.CopyFrom(input3_gpu, *cpu_place, context); + paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); context.Wait(); EXPECT_EQ(input3_ptr[0], 0); @@ -205,14 +205,15 @@ void GemvTest(int m, int n, bool trans) { } paddle::platform::CUDADeviceContext context(*gpu_place); - g_mat_a.CopyFrom(mat_a, *gpu_place, context); - g_vec_b.CopyFrom(vec_b, *gpu_place, context); + paddle::framework::CopyFrom(mat_a, *gpu_place, context, &g_mat_a); + paddle::framework::CopyFrom(vec_b, *gpu_place, context, &g_vec_b); paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., g_data_a, g_data_b, 0., g_data_c); - vec_c.CopyFrom(g_vec_c, paddle::platform::CPUPlace(), context); + paddle::framework::CopyFrom(g_vec_c, paddle::platform::CPUPlace(), context, + &vec_c); if (!trans) { for (int i = 0; i < m; ++i) { diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu index 09de9dc53a..7de9291c17 100644 --- a/paddle/operators/math/selected_rows_functor_test.cu +++ b/paddle/operators/math/selected_rows_functor_test.cu @@ -67,7 +67,7 @@ TEST(selected_rows_functor, gpu_add) { EXPECT_EQ(out_rows[6], 9); Tensor out_cpu; - out_cpu.CopyFrom(*out_value, cpu_place, ctx); + CopyFrom(*out_value, cpu_place, ctx, &out_cpu); ctx.Wait(); auto* out_cpu_data = out_cpu.data(); @@ -94,7 +94,7 @@ TEST(selected_rows_functor, gpu_add) { add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); Tensor tensor2_cpu; - tensor2_cpu.CopyFrom(*tensor2, cpu_place, ctx); + CopyFrom(*tensor2, cpu_place, ctx, &tensor2_cpu); ctx.Wait(); auto* tensor2_cpu_data = tensor2_cpu.data(); @@ -167,7 +167,7 @@ TEST(selected_rows_functor, gpu_add_to) { EXPECT_EQ(out_rows[6], 9); Tensor out_cpu; - out_cpu.CopyFrom(*out_value, cpu_place, ctx); + CopyFrom(*out_value, cpu_place, ctx, &out_cpu); ctx.Wait(); auto* out_cpu_data = out_cpu.data(); @@ -191,7 +191,7 @@ TEST(selected_rows_functor, gpu_add_to) { add_to_tensor_functor(ctx, *output, tensor1.get()); Tensor tensor1_cpu; - tensor1_cpu.CopyFrom(*tensor1, cpu_place, ctx); + CopyFrom(*tensor1, cpu_place, ctx, &tensor1_cpu); ctx.Wait(); auto* tensor1_cpu_data = tensor1_cpu.data(); diff --git a/paddle/operators/math/vol2col.h b/paddle/operators/math/vol2col.h index cbc30bd754..dc64d1d977 100644 --- a/paddle/operators/math/vol2col.h +++ b/paddle/operators/math/vol2col.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" #include "paddle/platform/device_context.h" namespace paddle { diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc index c31c716842..62c3152304 100644 --- a/paddle/operators/math/vol2col_test.cc +++ b/paddle/operators/math/vol2col_test.cc @@ -82,7 +82,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + CopyFrom(input_tmp, *place, *context, &input); } output.mutable_data({1, filter_size, filter_size, filter_size, output_depth, output_height, output_width}, @@ -96,7 +96,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output.data(); } else { - output_tmp.CopyFrom(output, paddle::platform::CPUPlace(), *context); + CopyFrom(output, paddle::platform::CPUPlace(), *context, &output_tmp); out_cfo_ptr = output_tmp.data(); } @@ -110,7 +110,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + CopyFrom(input_tmp, *place, *context, &input); } paddle::operators::math::Col2VolFunctor col2vol; @@ -120,7 +120,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); + CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp); in_ptr = input_tmp.data(); } diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc index 80460c4769..adc688dbd5 100644 --- a/paddle/operators/merge_lod_tensor_op.cc +++ b/paddle/operators/merge_lod_tensor_op.cc @@ -45,7 +45,7 @@ class MergeLoDTensorOp : public framework::OperatorBase { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { #ifdef PADDLE_WITH_CUDA - cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx); + framework::CopyFrom(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); #endif @@ -99,8 +99,9 @@ class MergeLoDTensorOp : public framework::OperatorBase { if (len == 0) { continue; } - out->Slice(out_offset, out_offset + len) - .CopyFrom(input->Slice(start_offset, end_offset), place, dev_ctx); + auto slice = out->Slice(out_offset, out_offset + len); + framework::CopyFrom(input->Slice(start_offset, end_offset), place, + dev_ctx, &slice); out_offset += len; (*in_idx) += 1; } diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu index 49ed8a8879..10dff8d021 100644 --- a/paddle/operators/multiplex_op.cu +++ b/paddle/operators/multiplex_op.cu @@ -33,7 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context()); + CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); Place place = boost::get(ctx.GetPlace()); @@ -68,7 +68,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context()); + CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index 56ba578549..bb7ae20286 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -97,7 +97,7 @@ class NCCLTester : public ::testing::Test { send_tensor->mutable_data(kDims, place); std::vector send_vector(f::product(kDims), gpu_id); - send_tensor->CopyFromVector(send_vector, *ctx); + paddle::framework::CopyFromVector(send_vector, *ctx, send_tensor); ctx->Wait(); VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); } diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index ea60665e39..c976e22c77 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -284,7 +284,8 @@ class RecurrentOp : public RecurrentBase { auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1); // Explicit copy output since the local RNN scope can be destroyed // early. - dst_out.CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx); + framework::CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx, + &dst_out); }); scopes.Next(); @@ -365,7 +366,8 @@ class RecurrentGradOp : public RecurrentBase { auto *cur_grad_var = cur_scope.Var(cur_grad); auto cur_grad_tensor = cur_grad_var->GetMutable(); - cur_grad_tensor->CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx); + framework::CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx, + cur_grad_tensor); } } @@ -438,7 +440,7 @@ class RecurrentGradOp : public RecurrentBase { } auto dst = outside->Slice(seq_offset, seq_offset + 1); - dst.CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx); + framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, &dst); }); VLOG(5) << "Link outside gradient finished "; @@ -451,7 +453,7 @@ class RecurrentGradOp : public RecurrentBase { framework::LoDTensor *outside) { outside->Resize(inside.dims()); outside->mutable_data(dev_ctx.GetPlace(), inside.type()); - outside->CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx); + framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, outside); }); VLOG(5) << "Link initialize state gradient finished "; } diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h index beb951713a..0e98c8b4f4 100644 --- a/paddle/operators/reshape_op.h +++ b/paddle/operators/reshape_op.h @@ -28,7 +28,7 @@ class ReshapeKernel : public framework::OpKernel { auto* in = ctx.Input("X"); auto out_dims = out->dims(); out->mutable_data(ctx.GetPlace()); - out->CopyFrom(*in, ctx.GetPlace(), ctx.device_context()); + framework::CopyFrom(*in, ctx.GetPlace(), ctx.device_context(), out); out->Resize(out_dims); } }; @@ -42,7 +42,7 @@ class ReshapeGradKernel : public framework::OpKernel { d_x->mutable_data(ctx.GetPlace()); auto in_dims = d_x->dims(); - d_x->CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context()); + framework::CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); d_x->Resize(in_dims); } }; diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc deleted file mode 100644 index ee61ea300c..0000000000 --- a/paddle/operators/rnn/recurrent_op_utils.cc +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/operators/rnn/recurrent_op_utils.h" - -namespace paddle { -namespace operators { -namespace rnn { - -namespace f = paddle::framework; - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -void SegmentInputs(const std::vector& step_scopes, - const std::vector& inlinks, - const size_t seq_len) { - PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided."); - for (size_t i = 0; i < inlinks.size(); ++i) { - // global inputs - auto input_var = step_scopes[0]->parent().FindVar(inlinks[i]); - PADDLE_ENFORCE_NOT_NULL(input_var, "input link [%s] is not in scope.", - inlinks[i]); - - LoDTensor* input = input_var->GetMutable(); - f::DDim dims = input->dims(); - PADDLE_ENFORCE_EQ(static_cast(dims[0]), seq_len, - "all the inputs be the same length"); - f::DDim step_dims = slice_ddim(dims, 1, dims.size()); - for (size_t j = 0; j < seq_len; j++) { - Tensor* step_input = - step_scopes[j]->Var(inlinks[i])->GetMutable(); - // The input of operators of each step is Tensor here. - // Maybe need to modify Slice function. - *step_input = input->Slice(j, j + 1); - step_input->Resize(step_dims); - } - } -} - -void ConcatOutputs(const std::vector& step_scopes, - const std::vector& outlinks, - const size_t seq_len, const platform::DeviceContext& ctx) { - for (size_t i = 0; i < outlinks.size(); i++) { - auto* output_var = step_scopes[0]->parent().FindVar(outlinks[i]); - PADDLE_ENFORCE_NOT_NULL(output_var, "output link [%s] is not in scope.", - outlinks[i]); - LoDTensor* output = output_var->GetMutable(); - - auto* step_scope_var = step_scopes[0]->FindVar(outlinks[i]); - PADDLE_ENFORCE_NOT_NULL(step_scope_var, "%s not in scope", outlinks[i]); - f::DDim step_dims = - step_scope_var->template GetMutable()->dims(); - std::vector dims_vec = vectorize(step_dims); - dims_vec.insert(dims_vec.begin(), seq_len); - output->Resize(f::make_ddim(dims_vec)); - output->mutable_data(platform::CPUPlace()); - for (size_t j = 0; j < seq_len; j++) { - LoDTensor* step_output = - step_scopes[j]->FindVar(outlinks[i])->GetMutable(); - // TODO(luotao02) data type and platform::DeviceContext() should set - // correctly - (output->Slice(j, j + 1)) - .CopyFrom(*step_output, platform::CPUPlace(), ctx); - } - } -} - -void LinkMemories(const std::vector& scopes, - const std::vector& memories, - const size_t step_id, const int offset) { - PADDLE_ENFORCE_LT(step_id, scopes.size(), - "step [%d] is out of range of step scopes' size [%d]", - step_id, scopes.size()); - PADDLE_ENFORCE_GE(static_cast(step_id) + offset, 0, - "offset [%d] must be large than -[%d]", offset, step_id); - PADDLE_ENFORCE_LT( - step_id + offset, scopes.size(), - "offset [%d] is out of range, it must be less than (%d - %d)", offset, - scopes.size(), step_id); - auto* scope = scopes[step_id]; - auto* linked_scope = scopes[step_id + offset]; - for (auto& attr : memories) { - auto* mem = scope->FindVar(attr.pre_var)->GetMutable(); - auto* linked_mem = linked_scope->FindVar(attr.var)->GetMutable(); - mem->Resize(linked_mem->dims()); - mem->ShareDataWith(*linked_mem); - } -} - -void InitArgument(const ArgumentName& name, Argument* arg, - const framework::OperatorBase& op, bool is_grad) { - arg->step_scopes = - is_grad ? op.Input(name.step_scopes) : op.Output(name.step_scopes); - arg->inlinks = op.Inputs(name.inlinks); - arg->outlinks = op.Outputs(name.outlinks); - - auto& boot_memories = is_grad ? op.Outputs(name.initial_states) - : op.Inputs(name.initial_states); - // attributes - auto& memories = op.Attr>(name.states); - auto& pre_memories = op.Attr>(name.ex_states); - - PADDLE_ENFORCE(memories.size() == boot_memories.size(), - "the size of states, initial_states don't match:%d,%d", - memories.size(), boot_memories.size()); - PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(), - "the size of ex_states, initial_states don't match:%d,%d", - pre_memories.size(), boot_memories.size()); - PADDLE_ENFORCE(memories.size() > 0, "more than 1 states should be set"); - - for (size_t i = 0; i < memories.size(); ++i) { - rnn::StateAttr mem_attr; - mem_attr.var = memories[i]; - mem_attr.pre_var = pre_memories[i]; - mem_attr.boot_var = boot_memories[i]; - (arg->states).push_back(mem_attr); - } -} - -} // namespace rnn -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h deleted file mode 100644 index fb0e158e07..0000000000 --- a/paddle/operators/rnn/recurrent_op_utils.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#include - -#include "paddle/framework/operator.h" - -namespace paddle { -namespace operators { -namespace rnn { - -using Scope = framework::Scope; - -/** - * Memory of a RNN (same as the role of `Momory` in PaddlePaddle). - * - * Memory attributes cached by this op, dims will be infered from - * boot memories in father scope. Other attributes are copied from Op's proto - * attributes. - */ -struct StateAttr { - // name of current state variable - std::string var; - // name of previous step's state variable - std::string pre_var; - // name of the variables to init this memory (same role of `boot_layer` in - // PaddlePaddle), which is store in father's scope. - std::string boot_var; -}; - -struct Argument { - std::string step_net; - std::string step_scopes; - std::vector inlinks; - std::vector outlinks; - std::vector states; -}; - -struct ArgumentName { - std::string step_net; - std::string step_scopes; - std::string inlinks; - std::string outlinks; - std::string states; // the memory name - std::string ex_states; // the previous memory name - std::string initial_states; // the boot memory name -}; - -/** - * Prepare inputs for each step net. - */ -void SegmentInputs(const std::vector& step_scopes, - const std::vector& inlinks, - const size_t seq_len); - -/** - * Process outputs of step nets and merge to variables. - */ -void ConcatOutputs(const std::vector& step_scopes, - const std::vector& outlinks, - const size_t seq_len, const platform::DeviceContext& ctx); - -void LinkMemories(const std::vector& step_scopes, - const std::vector& memories, const size_t step_id, - const int offset); - -void InitArgument(const ArgumentName& name, Argument* arg, - const framework::OperatorBase& op, bool is_grad = false); - -} // namespace rnn -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h old mode 100755 new mode 100644 index 2c9b8464a1..6411e0a466 --- a/paddle/operators/sequence_slice_op.h +++ b/paddle/operators/sequence_slice_op.h @@ -26,7 +26,7 @@ using LoD = framework::LoD; template inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data, - const int64_t* length_data) { + const int64_t* length_data) { auto out_lod = in.lod(); size_t lod_offset = 0; @@ -34,7 +34,7 @@ inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data, out_lod[0][0] = 0; for (size_t i = 0; i < n; ++i) { lod_offset += length_data[i]; - out_lod[0][i+1] = lod_offset; + out_lod[0][i + 1] = lod_offset; } return out_lod; } @@ -51,8 +51,7 @@ class SequenceSliceOpKernel : public framework::OpKernel { auto lod = in->lod(); auto n = lod[0].size() - 1; - PADDLE_ENFORCE_EQ(lod.size(), 1UL, - "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); PADDLE_ENFORCE_EQ( n, static_cast(length->dims()[0]), "The size of input-sequence and length-array should be the same") @@ -67,23 +66,23 @@ class SequenceSliceOpKernel : public framework::OpKernel { if (platform::is_gpu_place(ctx.GetPlace())) { offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); - offset_cpu.CopyFrom(*offset, platform::CPUPlace(), ctx.device_context()); + framework::CopyFrom(*offset, platform::CPUPlace(), ctx.device_context(), + &offset_cpu); offset_data = offset_cpu.data(); length_cpu.mutable_data(length->dims(), platform::CPUPlace()); - length_cpu.CopyFrom(*length, platform::CPUPlace(), ctx.device_context()); + framework::CopyFrom(*length, platform::CPUPlace(), ctx.device_context(), + &length_cpu); length_data = length_cpu.data(); } for (size_t i = 0; i < n; ++i) { PADDLE_ENFORCE_LT(0, offset_data[i], - "The offset[%d] must greater than zero.", i) + "The offset[%d] must greater than zero.", i) PADDLE_ENFORCE_LT(0, length_data[i], - "The length[%d] must greater than zero.", i) - PADDLE_ENFORCE_LT( - lod[0][i] + offset_data[i] + length_data[i], - lod[0][i + 1], - "The target tensor's length overflow.") + "The length[%d] must greater than zero.", i) + PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i], + lod[0][i + 1], "The target tensor's length overflow.") } out->mutable_data(ctx.GetPlace()); @@ -98,14 +97,12 @@ class SequenceSliceOpKernel : public framework::OpKernel { size_t out_offset = 0; for (size_t i = 0; i < n; ++i) { - Tensor in_t = - in->Slice(static_cast(lod[0][i] + offset_data[i]), - static_cast(lod[0][i] + offset_data[i] + - length_data[i])); - - StridedMemcpy(ctx.device_context(), in_t.data(), - in_stride, in_t.dims(), out_stride, - out->data() + out_offset); + Tensor in_t = in->Slice( + static_cast(lod[0][i] + offset_data[i]), + static_cast(lod[0][i] + offset_data[i] + length_data[i])); + + StridedMemcpy(ctx.device_context(), in_t.data(), in_stride, + in_t.dims(), out_stride, out->data() + out_offset); out_offset += length_data[i] * in_stride[0]; } } @@ -130,11 +127,13 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { if (platform::is_gpu_place(ctx.GetPlace())) { offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); - offset_cpu.CopyFrom(*offset, platform::CPUPlace(), ctx.device_context()); + framework::CopyFrom(*offset, platform::CPUPlace(), ctx.device_context(), + &offset_cpu); offset_data = offset_cpu.data(); length_cpu.mutable_data(length->dims(), platform::CPUPlace()); - length_cpu.CopyFrom(*length, platform::CPUPlace(), ctx.device_context()); + framework::CopyFrom(*length, platform::CPUPlace(), ctx.device_context(), + &length_cpu); length_data = length_cpu.data(); } @@ -162,8 +161,8 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { static_cast(lod[0][i] + offset_data[i] + length_data[i])); StridedMemcpy(ctx.device_context(), out_grad_t.data(), - out_grad_stride, out_grad_t.dims(), x_grad_stride, - x_grad_t.data()); + out_grad_stride, out_grad_t.dims(), x_grad_stride, + x_grad_t.data()); } } } diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc index 65bccc0c81..48597c1d2a 100644 --- a/paddle/operators/shrink_rnn_memory_op.cc +++ b/paddle/operators/shrink_rnn_memory_op.cc @@ -101,8 +101,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { } else { auto &dout_tensor = dout_var->Get(); auto height = dout_tensor.dims()[0]; - dx_tensor.Slice(0, static_cast(height)) - .CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx); + auto slice = dx_tensor.Slice(0, static_cast(height)); + framework::CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx, &slice); if (dx_tensor.dims()[0] < height) { auto rest_tensor = dx_tensor.Slice( static_cast(height), static_cast(dout_tensor.dims()[0])); diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc index db635f2ba0..f164a47711 100644 --- a/paddle/operators/split_lod_tensor_op.cc +++ b/paddle/operators/split_lod_tensor_op.cc @@ -49,7 +49,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { #ifdef PADDLE_WITH_CUDA - cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx); + framework::CopyFrom(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); #endif @@ -105,10 +105,11 @@ class SplitLoDTensorOp : public framework::OperatorBase { continue; } // out[offset: offset+len] = x[each_range.begin: each_range.end] - out->Slice(static_cast(offset), static_cast(offset + len)) - .CopyFrom(x.Slice(static_cast(each_range.begin), - static_cast(each_range.end)), - x.place(), dev_ctx); + auto slice = out->Slice(static_cast(offset), + static_cast(offset + len)); + framework::CopyFrom(x.Slice(static_cast(each_range.begin), + static_cast(each_range.end)), + x.place(), dev_ctx, &slice); offset += len; } } diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index 4ca1561139..4afec03ece 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -102,8 +102,8 @@ class SumKernel : public framework::OpKernel { out_array.resize(i + 1); } if (out_array[i].numel() == 0) { - out_array[i].CopyFrom(in_array[i], in_array[i].place(), - context.device_context()); + framework::CopyFrom(in_array[i], in_array[i].place(), + context.device_context(), &out_array[i]); out_array[i].set_lod(in_array[i].lod()); } else { PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); diff --git a/paddle/operators/tensor.save b/paddle/operators/tensor.save new file mode 100644 index 0000000000..c24308a7d0 Binary files /dev/null and b/paddle/operators/tensor.save differ diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index ae1b48d7a8..ad09fb53ce 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -38,7 +38,7 @@ class WriteToArrayOp : public ArrayOp { out->resize(offset + 1); } auto *out_tensor = &out->at(offset); - out_tensor->CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx); + CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor); out_tensor->set_lod(x_tensor.lod()); } }; @@ -116,7 +116,8 @@ class ReadFromArrayOp : public ArrayOp { auto *out_tensor = out->GetMutable(); size_t offset = GetOffset(scope, dev_ctx); PADDLE_ENFORCE_LT(offset, x_array.size()); - out_tensor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx); + framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx, + out_tensor); out_tensor->set_lod(x_array[offset].lod()); } }; diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index a9bcc47438..a54dc0d9fd 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1,8 +1,8 @@ if(WITH_PYTHON) cc_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc - DEPS pybind python backward proto_desc tensor_array paddle_memory executor prune + DEPS pybind python backward proto_desc paddle_memory executor prune ${GLOB_OP_LIB}) endif(WITH_PYTHON) -cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB} tensor_array) +cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB}) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index e697739cc6..f55a1edce3 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -26,9 +26,7 @@ limitations under the License. */ #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/prune.h" #include "paddle/framework/selected_rows.h" -#include "paddle/framework/tensor_array.h" #include "paddle/operators/cond_op.h" -#include "paddle/operators/dynamic_recurrent_op.h" #include "paddle/operators/net_op.h" #include "paddle/platform/enforce.h" #include "paddle/platform/place.h" @@ -395,83 +393,6 @@ All parameter, weight, gradient are variables in Paddle. self->CompleteAddOp(); }); - py::class_(m, "TensorArray") - .def("__init__", - [](TensorArray &instance) { new (&instance) TensorArray(); }) - .def("read", - [](TensorArray &self, size_t index) { return self.Read(index); }) - .def("write", [](TensorArray &self, size_t index, - LoDTensor &value) { self.Write(index, value); }) - .def("write_shared", - [](TensorArray &self, size_t index, const LoDTensor &value) { - self.WriteShared(index, value); - }) - .def("size", [](TensorArray &self) { return self.size(); }) - .def("pack", - [](TensorArray &self, size_t level, - const std::vector> &meta_info, - const std::vector> &lod) { - std::vector meta; - for (auto &info : meta_info) { - PADDLE_ENFORCE_EQ(info.size(), 3UL); - meta.emplace_back(info[0], info[1], info[2]); - } -#ifndef PADDLE_WITH_CUDA - return self.Pack(level, meta, lod); -#else - LoD new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return self.Pack(level, meta, new_lod); -#endif - }) - .def("unpack", - [](TensorArray &self, const LoDTensor &source, int level, - bool length_descend) { - auto metas = self.Unpack(source, level, length_descend); - std::vector> meta_info; - for (auto meta : metas) { - meta_info.emplace_back( - std::vector({meta.begin, meta.end, meta.ori_idx})); - } - return meta_info; - }) - .def("stack", [](TensorArray &self) { return self.Stack(); }) - .def("unstack", - [](TensorArray &self, const LoDTensor &source) { - return self.Unstack(source); - }) - .def("unstack_shared", [](TensorArray &self, const LoDTensor &source) { - return self.UnstackShared(source); - }); - - py::class_(m, - "DynamicRecurrentOp") - .def_static("create", - [](py::bytes protobin) -> operators::DynamicRecurrentOp * { - OpDesc desc; - PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), - "Cannot parse user input to OpDesc"); - PADDLE_ENFORCE(desc.IsInitialized(), - "User OpDesc is not initialized, reason %s", - desc.InitializationErrorString()); - auto rnn_op = OpRegistry::CreateOp(desc); - return static_cast( - rnn_op.release()); - }) - .def("set_step_unit", - [](operators::DynamicRecurrentOp &self, const operators::NetOp &net) - -> void { self.rnn.SetStepUnit(net.Clone()); }) - .def("get_state", - [](operators::DynamicRecurrentOp &self, const std::string &name) - -> const TensorArray & { return self.rnn.state(name); }) - .def("get_step_input", - [](operators::DynamicRecurrentOp &self, const std::string &name) - -> const TensorArray & { return self.rnn.step_input(name); }) - .def("get_step_output", - [](operators::DynamicRecurrentOp &self, const std::string &name) - -> const TensorArray & { return self.rnn.step_output(name); }); - // cond_op py::class_(m, "CondOp") .def_static("create", diff --git a/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py b/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py deleted file mode 100644 index c2d8b48ea9..0000000000 --- a/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py +++ /dev/null @@ -1,171 +0,0 @@ -import logging -import paddle.v2.fluid.core as core -import unittest -from paddle.v2.fluid.op import Operator, DynamicRecurrentOp -import numpy as np - -# for siplicity, just one level LoD -lod_py = [[0, 4, 7, 9, 10]] -input_dim = 30 -num_sents = len(lod_py[0]) - 1 -weight_dim = 15 - - -def create_tensor(scope, name, shape, np_data): - tensor = scope.var(name).get_tensor() - tensor.set_dims(shape) - tensor.set(np_data, core.CPUPlace()) - return tensor - - -class PyRNNStep(object): - def __init__(self): - - self.x = np.random.normal(size=(lod_py[0][-1], - input_dim)).astype("float32") - self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32") - self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32") - self.h_boot = np.random.normal(size=(num_sents, - input_dim)).astype("float32") - - -class DynamicRecurrentOpTest(unittest.TestCase): - ''' - Test RNNOp - - equation: - h_t = \sigma (W x_t + U h_{t-1}) - weights: - - W - - U - vars: - - x - states: - - h - outputs: - - h - ''' - - py = PyRNNStep() - - def forward(self): - self.scope = core.Scope() - self.create_global_variables() - self.create_rnn_op() - self.create_step_net() - ctx = core.DeviceContext.create(core.CPUPlace()) - self.rnnop.run(self.scope, ctx) - state = self.rnnop.get_state("h@state") - print 'state size: ', state.size() - - step_inputs = self.rnnop.get_step_input("x") - print "x size ", step_inputs.size() - for i in range(step_inputs.size()): - print "x %d" % i, np.array(step_inputs.read(i).get_dims()) - step_outputs = self.rnnop.get_step_output('h@state') - print 'step_outputs.size ', step_outputs.size() - output = self.scope.find_var("h@state").get_tensor() - print 'output', np.array(output).shape - - def create_global_variables(self): - # create inlink - x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim], - self.py.x) - x_tensor.set_lod(lod_py) - create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W) - create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U) - create_tensor(self.scope, "h_boot", [num_sents, input_dim], - self.py.h_boot) - self.scope.var("step_scopes") - self.scope.var("h@state") - - def create_rnn_op(self): - # create RNNOp - self.rnnop = DynamicRecurrentOp( - # inputs - inputs=["x"], - initial_states=["h_boot"], - step_net="step_unit", - # outputs - outputs=["h@state"], - step_scopes="step_scopes", - # attributes - ex_states=["h@pre"], - states=["h@state"]) - - def create_step_net(self): - step_unit = core.Net.create() - x_fc_op = Operator("mul", X="x", Y="W", Out="Wx") - h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh") - sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum") - sig_op = Operator("sigmoid", X="sum", Y="h@state") - - for op in [x_fc_op, h_fc_op, sum_op, sig_op]: - step_unit.append_op(op) - step_unit.complete_add_op(True) - self.rnnop.set_step_unit(step_unit) - - def test_forward(self): - print 'test recurrent op forward' - pd_output = self.forward() - print 'pd_output', pd_output - - -class RecurrentGradientOpTest(unittest.TestCase): - py = PyRNNStep() - - def create_forward_op(self): - # create RNNOp - self.forward_op = DynamicRecurrentOp( - # inputs - inputs=["x"], - initial_states=["h_boot"], - step_net="step_unit", - # outputs - outputs=["h@state"], - step_scopes="step_scopes", - # attributes - ex_states=["h@pre"], - states=["h@state"]) - - def create_gradient_op(self): - a = set() - backward_op = core.DynamicRecurrentOp.backward(self.forward_op, a) - - def create_step_net(self): - step_unit = core.Net.create() - x_fc_op = Operator("mul", X="x", Y="W", Out="Wx") - h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh") - sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum") - sig_op = Operator("sigmoid", X="sum", Y="h@state") - - for op in [x_fc_op, h_fc_op, sum_op, sig_op]: - step_unit.append_op(op) - step_unit.complete_add_op(True) - self.forward_op.set_step_unit(step_unit) - - def create_global_variables(self): - # create inlink - x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim], - self.py.x) - x_tensor.set_lod(lod_py) - create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W) - create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U) - create_tensor(self.scope, "h_boot", [num_sents, input_dim], - self.py.h_boot) - self.scope.var("step_scopes") - self.scope.var("h@state") - - def test_grad(self): - self.scope = core.Scope() - self.create_forward_op() - self.create_global_variables() - self.create_step_net() - self.create_gradient_op() - - -if __name__ == '__main__': - exit( - 0 - ) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 - unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_nccl_init_op.py b/python/paddle/v2/fluid/tests/test_nccl_init_op.py deleted file mode 100644 index a536800ccd..0000000000 --- a/python/paddle/v2/fluid/tests/test_nccl_init_op.py +++ /dev/null @@ -1,39 +0,0 @@ -import unittest, os -import numpy as np -import paddle.v2 as paddle -from paddle.v2.fluid.op import Operator -import paddle.v2.fluid.core as core -from op_test import OpTest, create_op, set_input - -if not core.is_compile_gpu(): - exit(0) - -gpu_count = core.get_cuda_device_count() - -if gpu_count <= 1: - exit(0) - -g_scope = core.Scope() -g_ctx = core.DeviceContext.create(core.CPUPlace()) - - -class TestNCCLInit(unittest.TestCase): - def test_init(self): - self.op_type = "ncclInit" - self.gpus = range(gpu_count) - - self.inputs = {} - self.attrs = {"gpus": self.gpus} - g_scope.var("Communicator").get_communicator() - self.outputs = {"Communicator": g_scope.find_var("Communicator")} - nccl_init = create_op( - g_scope, - op_type=self.op_type, - inputs=self.inputs, - outputs=self.outputs, - attrs=self.attrs) - nccl_init.run(g_scope, g_ctx) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_tensor_array.py b/python/paddle/v2/fluid/tests/test_tensor_array.py deleted file mode 100644 index d6929ba16e..0000000000 --- a/python/paddle/v2/fluid/tests/test_tensor_array.py +++ /dev/null @@ -1,106 +0,0 @@ -import logging -import paddle.v2.fluid.core as core -import unittest -import numpy as np - - -class TestTensorArray(unittest.TestCase): - def setUp(self): - self.ta = core.TensorArray() - - self.batch_size = 10 - self.dim = 2 - - # create a LoDTensor - self.scope = core.Scope() - var = self.scope.var("test_tensor") - self.place = core.CPUPlace() - tensor = var.get_tensor() - tensor.set_dims([self.batch_size, self.dim]) - tensor.alloc_float(self.place) - tensor_array = np.array(tensor) - tensor_array[0, 0] = 0 - tensor_array[1, 0] = 1 - tensor_array[2, 0] = 2 - tensor_array[3, 0] = 3 - tensor_array[4, 0] = 4 - tensor_array[5, 0] = 5 - tensor_array[6, 0] = 6 - tensor_array[7, 0] = 7 - tensor_array[8, 0] = 8 - tensor_array[9, 0] = 9 - - lod_py = [[0, 2, 5, 10]] - lod_tensor = core.LoDTensor(lod_py) - lod_tensor.set(tensor_array, self.place) - - self.py_seq_meta = [[5, 10, 2], [2, 5, 1], [0, 2, 0]] - - self.tensor = lod_tensor - - def test_unstack(self): - self.ta.unstack(self.tensor) - self.assertEqual(self.tensor.get_dims()[0], self.ta.size()) - - def test_read(self): - self.ta.unstack(self.tensor) - for i in range(self.batch_size): - tensor = self.ta.read(i) - - def test_write(self): - self.ta.unstack(self.tensor) - - # create a tensor with shape of [1, self.dim] - var = self.scope.var("hell") - tensor = var.get_tensor() - tensor.set_dims([1, self.dim]) - tensor.alloc_float(self.place) - tensor_array = np.array(tensor) - for i in range(self.dim): - tensor_array[0, i] = i - tensor.set(tensor_array, self.place) - - self.ta.write(2, tensor) - - ta_tensor = self.ta.read(2) - ta_tensor_array = np.array(ta_tensor) - self.assertEqual(ta_tensor.get_dims(), [1, self.dim]) - self.assertTrue((tensor_array == ta_tensor_array).all()) - - def test_write_shared(self): - self.ta.unstack(self.tensor) - - # create a tensor with shape of [1, self.dim] - var = self.scope.var("hell") - tensor = var.get_tensor() - tensor.set_dims([1, self.dim]) - tensor.alloc_float(self.place) - tensor_array = np.array(tensor) - for i in range(self.dim): - tensor_array[0, i] = i - tensor.set(tensor_array, self.place) - - self.ta.write_shared(2, tensor) - - ta_tensor = self.ta.read(2) - ta_tensor_array = np.array(ta_tensor) - self.assertEqual(ta_tensor.get_dims(), [1, self.dim]) - self.assertTrue((tensor_array == ta_tensor_array).all()) - - def test_unpack(self): - meta = self.ta.unpack(self.tensor, 0, True) - self.assertEqual(self.ta.size(), 5) - self.assertEqual(meta, self.py_seq_meta) - - def test_pack(self): - meta = self.ta.unpack(self.tensor, 0, True) - print "meta", meta - tensor = self.ta.pack(0, meta, self.tensor.lod()) - print np.array(self.tensor) - print np.array(tensor) - self.assertTrue((np.array(self.tensor) == np.array(tensor)).all()) - self.assertTrue(tensor.lod(), self.tensor.lod()) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/v2/fluid/tests/tmp/inference_model/__model__ b/python/paddle/v2/fluid/tests/tmp/inference_model/__model__ new file mode 100644 index 0000000000..e333d10da9 Binary files /dev/null and b/python/paddle/v2/fluid/tests/tmp/inference_model/__model__ differ diff --git a/python/paddle/v2/fluid/tests/tmp/inference_model/fc_0.b_0 b/python/paddle/v2/fluid/tests/tmp/inference_model/fc_0.b_0 new file mode 100644 index 0000000000..b1e5fad056 Binary files /dev/null and b/python/paddle/v2/fluid/tests/tmp/inference_model/fc_0.b_0 differ diff --git a/python/paddle/v2/fluid/tests/tmp/inference_model/fc_0.w_0 b/python/paddle/v2/fluid/tests/tmp/inference_model/fc_0.w_0 new file mode 100644 index 0000000000..2f41796c04 Binary files /dev/null and b/python/paddle/v2/fluid/tests/tmp/inference_model/fc_0.w_0 differ diff --git a/python/paddle/v2/framework/tests/test_elementwise_mod_op.py b/python/paddle/v2/framework/tests/test_elementwise_mod_op.py new file mode 100644 index 0000000000..35c38147a2 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_elementwise_mod_op.py @@ -0,0 +1,36 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class ElementwiseModOp(OpTest): + def setUp(self): + self.op_type = "elementwise_mod" + """ Warning + CPU gradient check error! + 'X': np.random.randint((32,84)).astype("int32"), + 'Y': np.random.randint((32,84)).astype("int32") + """ + self.inputs = { + 'X': np.random.randint(1, 10, [13, 17]).astype("int32"), + 'Y': np.random.randint(1, 10, [13, 17]).astype("int32") + } + self.outputs = {'Out': np.mod(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y')) + + +if __name__ == '__main__': + unittest.main()