Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into develop

revert-4814-Add_sequence_project_op
zchen0211 8 years ago
commit 502e72591f

@@ -189,7 +189,7 @@ OpDesc {
   inputs = {0} // the index of x in vars of BlockDesc above
   outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above
   attrs {
-    "memories" : {1} // the index of h
+    "states" : {1} // the index of h
     "step_net" : <above step net>
   }
 };

@@ -28,23 +28,37 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
 add_dependencies(paddle_capi paddle_proto)
 
-# combine all paddle static libraries together, into libpaddle_capi_whole.a
-# user should use PaddleCAPI as -lpaddle_capi_whole
+# TODO: paddle_capi_whole will be removed.
+if(MOBILE_INFERENCE)
   set(PADDLE_CAPI_INFER_LIBS
       paddle_utils
       paddle_parameter
       paddle_math
      paddle_cuda
      paddle_function
      paddle_gserver
      paddle_proto)
+else()
+  set(PADDLE_CAPI_INFER_LIBS
+      paddle_utils
+      paddle_parameter
+      paddle_math
+      paddle_cuda
+      paddle_function
+      paddle_gserver
+      paddle_proto
+      paddle_pserver
+      paddle_network)
+endif()
 cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
 
-# No shared library for iOS
+# Link the static library for inference
+cc_library(paddle_capi_engine DEPS paddle_capi paddle_utils paddle_parameter paddle_math paddle_cuda paddle_proto)
+cc_library(paddle_capi_layers DEPS paddle_function paddle_gserver)
+
+# Link the shared library for inference
 if(NOT IOS)
-  set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map")
-  # TODO: merge mkl into paddle_capi_shared
+  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_capi.map")
   add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
   set_target_properties(paddle_capi_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
   target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
@@ -53,9 +67,10 @@ endif()
 # install library & headers.
 install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
+install(FILES paddle_capi.map DESTINATION include/paddle)
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
 if(ANDROID)
-  install(TARGETS paddle_capi_whole paddle_capi_shared
+  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers paddle_capi_shared
           ARCHIVE DESTINATION lib/${ANDROID_ABI}
           LIBRARY DESTINATION lib/${ANDROID_ABI})
   execute_process(
@@ -80,7 +95,7 @@ if(ANDROID)
     )"
   )
 else(ANDROID)
-  install(TARGETS paddle_capi_whole ARCHIVE DESTINATION lib)
+  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers ARCHIVE DESTINATION lib)
   if(NOT IOS)
     install(TARGETS paddle_capi_shared DESTINATION lib)
   endif()

@@ -21,6 +21,7 @@
 #include "paddle/framework/block_desc.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/dynamic_recurrent_op.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
@@ -220,8 +221,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
   // process recurrent gradient op as a special operator.
   if (forwardOp.Type() == "recurrent") {
     // NOTE clean up cycle call somewhere (RNN's stepnet constains itself),
-    // or
-    // this will result in infinite loop.
+    // or this will result in infinite loop.
     const auto& rnnop =
         *static_cast<const operators::RecurrentOp*>(&forwardOp);
     auto rnn_grad_op =
@@ -231,6 +231,18 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
     // create stepnet's gradient op
     rnn_grad_op->set_stepnet(
         BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
+  } else if (forwardOp.Type() == "dynamic_recurrent") {
+    // NOTE clean up cycle call somewhere (RNN's stepnet constains itself),
+    // or this will result in infinite loop.
+    const auto& rnnop =
+        *static_cast<const operators::DynamicRecurrentOp*>(&forwardOp);
+    auto rnn_grad_op =
+        static_cast<operators::DynamicRecurrentGradientOp*>(grad_op.get());
+    const auto& stepnet_op =
+        *static_cast<const OperatorBase*>(&rnnop.rnn.GetStepUnit());
+    // create stepnet's gradient op
+    rnn_grad_op->rnn.SetStepUnit(
+        BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
   }
 
   if (net->ops_.empty()) {  // Current no aux op is added to network

@@ -26,6 +26,8 @@ inline DataType ToDataType(std::type_index type) {
     return DataType::FP64;
   } else if (typeid(int).hash_code() == type.hash_code()) {
     return DataType::INT32;
+  } else if (typeid(int64_t).hash_code() == type.hash_code()) {
+    return DataType::INT64;
   } else {
     PADDLE_THROW("Not supported");
   }
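The hunk above adds a 64-bit integer branch to the type-to-enum dispatch. Below is a minimal standalone sketch of the same pattern so it can be tried in isolation; the enum values and the ToDataType name mirror the hunk, but this is an illustrative stand-in, not the Paddle header.

// Standalone sketch (not paddle/framework/data_type.h): map a std::type_index
// onto an enum, including the newly added int64_t branch.
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <typeindex>
#include <typeinfo>

enum class DataType { FP32, FP64, INT32, INT64 };

inline DataType ToDataType(std::type_index type) {
  if (typeid(float).hash_code() == type.hash_code()) {
    return DataType::FP32;
  } else if (typeid(double).hash_code() == type.hash_code()) {
    return DataType::FP64;
  } else if (typeid(int).hash_code() == type.hash_code()) {
    return DataType::INT32;
  } else if (typeid(int64_t).hash_code() == type.hash_code()) {
    return DataType::INT64;  // the branch introduced by this change
  } else {
    throw std::runtime_error("Not supported");
  }
}

int main() {
  std::cout << (ToDataType(typeid(int64_t)) == DataType::INT64) << "\n";  // prints 1
}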

@@ -84,8 +84,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
     op->Run(local_scope, *device);
   }
 
-  // TODO(tonyyang-svail):
-  //  - Destroy local_scope
+  scope->DeleteScope(&local_scope);
 }
 
 }  // namespace framework

@@ -21,28 +21,28 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-template <typename T>
-void SetFeedVariable(const LoDTensor& input, const std::string& var_name,
-                     size_t index) {
+void SetFeedVariable(Scope* scope, const LoDTensor& input,
+                     const std::string& var_name, size_t index) {
   // If var_name Variable is not found in GlobalScope, a new variable will
   // be created.
   VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
-  Variable* g_feed_value = GetGlobalScope().Var(var_name);
+  Variable* g_feed_value = scope->Var(var_name);
   auto& feed_inputs =
       *(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
   if (index >= feed_inputs.size()) {
     feed_inputs.resize(index + 1);
   }
   // shared data with input tensor
-  feed_inputs[index].ShareDataWith<T>(input);
+  feed_inputs[index].ShareDataWith(input);
   // set lod
   feed_inputs[index].set_lod(input.lod());
 }
 
-LoDTensor& GetFetchVariable(const std::string& var_name, size_t index) {
+LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
+                            size_t index) {
   // Since we want to fetch LodTensor from a variable, the variable must
   // be created alreadly.
-  Variable* g_fetch_value = GetGlobalScope().FindVar(var_name);
+  Variable* g_fetch_value = scope.FindVar(var_name);
   PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
                  "Only %s can be invoked by GetFetchVariable",
                  typeid(FeedFetchList).name());
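The signatures above now take an explicit scope instead of reaching for a global one. A minimal standalone sketch of that idea, with simplified stand-in types (Scope, Tensor and both functions here are illustrative, not the Paddle implementation, which shares tensor data rather than copying it):

// Feed/fetch buffers live in a caller-provided scope, not a process-wide global.
#include <cassert>
#include <map>
#include <string>
#include <vector>

using Tensor = std::vector<float>;  // stand-in for LoDTensor

struct Scope {
  std::map<std::string, std::vector<Tensor>> vars;
};

void SetFeedVariable(Scope* scope, const Tensor& input,
                     const std::string& var_name, size_t index) {
  auto& feed_inputs = scope->vars[var_name];        // created on demand
  if (index >= feed_inputs.size()) feed_inputs.resize(index + 1);
  feed_inputs[index] = input;                       // real code shares data instead
}

const Tensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
                               size_t index) {
  return scope.vars.at(var_name).at(index);         // must already exist
}

int main() {
  Scope scope;                                      // no global scope involved
  SetFeedVariable(&scope, Tensor{1.f, 2.f}, "feed", 0);
  assert(GetFetchVariable(scope, "feed", 0).size() == 2);
}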

@@ -25,31 +25,50 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
   for (size_t i = level_begin; i < level_end; i++) {
     new_lod.emplace_back(in.at(i));
   }
+  // transform the lowest level to absolute offset.
+  LoD abs_offset_lod = ToAbsOffset(in);
+  new_lod.back() = abs_offset_lod[level_end - 1];
   return new_lod;
 }
 
 LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
                  size_t elem_end) {
-  // slice the lod.
-  LoD new_lod;
-  new_lod.reserve(in.size() - level);
-  auto start = in.at(level)[elem_begin];
-  auto end = in.at(level)[elem_end];
-
-  for (auto it = in.begin() + level; it != in.end(); it++) {
-    auto it_begin = std::find(it->begin(), it->end(), start);
-    auto it_end = std::find(it_begin, it->end(), end);
-    PADDLE_ENFORCE(it_begin != it->end(), "error in parsing lod info");
-    PADDLE_ENFORCE(it_end != it->end(), "error in parsing lod info");
-    new_lod.emplace_back(it_begin, it_end + 1);
-    // reset offset if tensor is copyed and sliced.
-    std::transform(new_lod.back().begin(), new_lod.back().end(),
-                   new_lod.back().begin(),
-                   [start](int v) { return v - start; });
-    PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LoD");
+  PADDLE_ENFORCE_LT(level, in.size());
+  PADDLE_ENFORCE_LT(elem_end, in[level].size());
+
+  LoD res;
+  res.resize(in.size() - level);
+  // copy the first level
+  res[0].assign(in[level].begin() + elem_begin,
+                in[level].begin() + elem_end + 1);
+  for (size_t lvl = 1; lvl < res.size(); lvl++) {
+    const auto& in_level = in[level + lvl];
+    const auto& above_level = res[lvl - 1];
+    auto& out_level = res[lvl];
+    out_level.assign(in_level.begin() + above_level.front(),
+                     in_level.begin() + above_level.back() + 1);
   }
-  PADDLE_ENFORCE_LE(new_lod.size(), in.size());
-  return new_lod;
+  for (size_t lvl = 0; lvl < res.size(); lvl++) {
+    // to make the first offset equals 0, all the elements minus the first
+    // element
+    size_t front = res[lvl].front();
+    for (auto& ele : res[lvl]) {
+      ele -= front;
+    }
+  }
+  return res;
+}
+
+LoD ToAbsOffset(const LoD& in) {
+  // the lowest level stores relative offsets
+  if (in.empty() || in.size() == 1) return in;
+  LoD result = in;
+  for (int level = result.size() - 2; level >= 0; level--) {
+    for (auto& ele : result[level]) {
+      ele = result[level + 1][ele];
+    }
+  }
+  return result;
 }
 
 bool operator==(const LoD& a, const LoD& b) {
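For reference, a self-contained sketch of the relative-to-absolute conversion that the new ToAbsOffset performs, using plain std::vector instead of the framework's LoD/Vector types, and fed with the 3-level example from the new lod_tensor.h comment:

// Each upper-level entry indexes into the level below; the lowest level holds
// absolute offsets into the tensor itself.
#include <cassert>
#include <cstddef>
#include <vector>

using LoD = std::vector<std::vector<size_t>>;

LoD ToAbsOffset(const LoD& in) {
  if (in.size() <= 1) return in;            // lowest level is already absolute
  LoD result = in;
  for (int level = static_cast<int>(result.size()) - 2; level >= 0; level--) {
    for (auto& ele : result[level]) {
      ele = result[level + 1][ele];         // chase the index one level down
    }
  }
  return result;
}

int main() {
  LoD relative = {{0, 2, 3}, {0, 2, 4, 7}, {0, 2, 5, 7, 10, 12, 15, 20}};
  LoD absolute = ToAbsOffset(relative);
  // Level-1 entry "2" points at element 2 of the lowest level, i.e. offset 5.
  assert((absolute[1] == std::vector<size_t>{0, 5, 10, 20}));
  // Level-0 entries resolve through level 1 down to tensor offsets.
  assert((absolute[0] == std::vector<size_t>{0, 10, 20}));
  return 0;
}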
@@ -75,17 +94,7 @@ bool operator==(const LoD& a, const LoD& b) {
 size_t LoDTensor::NumElements(size_t level, size_t idx) const {
   PADDLE_ENFORCE_LT(level, NumLevels());
   PADDLE_ENFORCE_LT(idx, NumElements(level));
-  // the last level of LoD, just return number of records in Tensor
-  if (level == NumLevels() - 1) {
-    return lod_[level][idx + 1] - lod_[level][idx];
-  }
-  // high level of LoD, and there is another lower level, return number of
-  // lower-level elements
-  auto tmp = SliceInLevel(lod_, level, idx, idx + 1);
-  PADDLE_ENFORCE_GE(tmp.size(), 2);
-  // there is a 0 as a placeholder stored in LoD, so the number of elements
-  // equals lod.size() - 1
-  return tmp[1].size() - 1;
+  return lod_[level][idx + 1] - lod_[level][idx];
 }
 
 void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) {

@@ -39,23 +39,36 @@ using Vector = thrust::host_vector<
 #endif
 
 /*
- * 3-level LoD stores
+ * LoD is short for Level of Details.
  *
- * 0 10 20
- * 0 5 10 15 20
- * 0 2 5 7 10 12 15 20
- *
- * - in a level, each element indicates offset in the underlying Tensor
+ * - in a level, each element indicates relative offset of the lower level
  * - the first element should be 0 and that indicates that this sequence start
  * from 0
  * - each sequence's begin and end(no-inclusive) is level[id, id+1]
+ *
+ * For example:
+ *    3-level LoD stores
+ *
+ *    0 2 3
+ *    0 2 4 7
+ *    0 2 5 7 10 12 15 20
 */
 using LoD = std::vector<Vector<size_t>>;
 
+/*
+ * Slice levels from a LoD.
+ * NOTE the lowest level should always be the absolute offsets of the underlying
+ * tensor instances. So if higher layers are sliced without the lowest level,
+ * the lower level of the sliced LoD will be transformed to the absolute offset.
+ */
 LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end);
 
 LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
                  size_t elem_end);
+
+/*
+ * Transform an LoD from relative offsets to absolute offsets.
+ */
+LoD ToAbsOffset(const LoD& in);
 
 bool operator==(const LoD& a, const LoD& b);

@@ -30,8 +30,8 @@ class LoDTensorTester : public ::testing::Test {
     // 0 5 10 15 20
     // 0 2 5 7 10 12 15 20
     LoD lod;
-    lod.push_back(std::vector<size_t>{0, 10, 20});
-    lod.push_back(std::vector<size_t>{0, 5, 10, 15, 20});
+    lod.push_back(std::vector<size_t>{0, 2, 3});
+    lod.push_back(std::vector<size_t>{0, 2, 5, 8});
     lod.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});
 
     ASSERT_EQ(lod.size(), 3UL);
@@ -52,14 +52,14 @@ TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor_.NumLevels(), 3UL); }
 
 TEST_F(LoDTensorTester, NumElements) {
   ASSERT_EQ(lod_tensor_.NumElements(0), 2UL);
-  ASSERT_EQ(lod_tensor_.NumElements(1), 4UL);
+  ASSERT_EQ(lod_tensor_.NumElements(1), 3UL);
   ASSERT_EQ(lod_tensor_.NumElements(2), 8UL);
 }
 
 TEST_F(LoDTensorTester, NumElements2) {
   ASSERT_EQ(lod_tensor_.NumElements(0, 0), 2UL);
-  ASSERT_EQ(lod_tensor_.NumElements(0, 1), 2UL);
-  ASSERT_EQ(lod_tensor_.NumElements(1, 1), 2UL);
+  ASSERT_EQ(lod_tensor_.NumElements(0, 1), 1UL);
+  ASSERT_EQ(lod_tensor_.NumElements(1, 1), 3UL);
 }
 
 TEST_F(LoDTensorTester, ShrinkLevels) {
@@ -68,17 +68,16 @@ TEST_F(LoDTensorTester, ShrinkLevels) {
     LoDTensor new_lod_tensor = lod_tensor_;
     new_lod_tensor.ShrinkLevels(level, level + 1);
     ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
-    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
     ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
   }
   // shrink 2 level
   for (size_t level = 0; level < 2UL; ++level) {
     LoDTensor new_lod_tensor = lod_tensor_;
     new_lod_tensor.ShrinkLevels(level, level + 2);
+    // the lowest level's last element should be the tensor's batch_size.
+    ASSERT_EQ(new_lod_tensor.lod().back().back(),
+              lod_tensor_.lod().back().back());
     ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
-    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
-    ASSERT_EQ(new_lod_tensor.NumElements(1),
-              lod_tensor_.NumElements(level + 1));
     ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
   }
 }
@@ -86,19 +85,19 @@ TEST_F(LoDTensorTester, ShrinkLevels) {
 TEST_F(LoDTensorTester, ShrinkInLevel) {
   size_t level = 0;
   LoDTensor new_lod_tensor = lod_tensor_;
-  new_lod_tensor.ShrinkInLevel(level, 0, 2);
+  new_lod_tensor.ShrinkInLevel(level, 0, 1);
   EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL);
-  EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL);
-  EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL);
-  EXPECT_EQ(new_lod_tensor.NumElements(2), 8UL);
+  EXPECT_EQ(new_lod_tensor.NumElements(0), 1UL);
+  EXPECT_EQ(new_lod_tensor.NumElements(1), 2UL);
+  EXPECT_EQ(new_lod_tensor.NumElements(2), 5UL);
   ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
 
   level = 1;
   new_lod_tensor = lod_tensor_;
-  new_lod_tensor.ShrinkInLevel(level, 0, 2);
+  new_lod_tensor.ShrinkInLevel(level, 1, 2);
   ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(1), 3UL);
   ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
 }

@@ -65,12 +65,11 @@ void Scope::DropKids() {
   kids_.clear();
 }
 
-framework::Scope& GetGlobalScope() {
-  static framework::Scope* g_scope = nullptr;
-  if (g_scope == nullptr) {
-    g_scope = new framework::Scope();
-  }
-  return *g_scope;
+void Scope::DeleteScope(Scope* scope) {
+  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
+  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
+  this->kids_.erase(it);
+  delete scope;
 }
 
 }  // namespace framework
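The global scope accessor is replaced by an explicit parent-owned deletion. A standalone sketch of that ownership pattern (the Scope class here is a simplified stand-in, not paddle::framework::Scope):

// A parent scope owns raw pointers to its kids; DeleteScope unregisters the
// child from kids_ and then frees it, so the destructor never double-deletes.
#include <algorithm>
#include <cassert>
#include <vector>

class Scope {
 public:
  ~Scope() {
    for (Scope* kid : kids_) delete kid;   // parent owns its children
  }

  Scope& NewScope() {
    kids_.push_back(new Scope());
    return *kids_.back();
  }

  void DeleteScope(Scope* scope) {
    auto it = std::find(kids_.begin(), kids_.end(), scope);
    assert(it != kids_.end() && "Cannot find scope as kid");
    kids_.erase(it);                       // unregister first,
    delete scope;                          // then release the memory
  }

 private:
  std::vector<Scope*> kids_;
};

int main() {
  Scope global;
  Scope& local = global.NewScope();        // e.g. an executor's local scope
  global.DeleteScope(&local);              // explicit cleanup, no global state
}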

@@ -59,6 +59,8 @@ class Scope {
   /// Find the scope or an ancestor scope that contains the given variable.
   const Scope* FindScope(const Variable* var) const;
 
+  void DeleteScope(Scope* scope);
+
   /// Drop all kids scopes belonged to this scope.
   void DropKids();
@@ -72,8 +74,5 @@ class Scope {
   DISABLE_COPY_AND_ASSIGN(Scope);
 };
 
-framework::Scope& GetGlobalScope();
-
 }  // namespace framework
 }  // namespace paddle

@@ -60,6 +60,10 @@ class Tensor {
   template <typename T>
   inline T* mutable_data(platform::Place place);
 
+  inline void* mutable_data(platform::Place place, std::type_index type);
+
+  inline void* mutable_data(platform::Place place);
+
   /**
    * @brief     Return a pointer to mutable memory block.
    *
@@ -81,7 +85,6 @@ class Tensor {
   inline Tensor& Resize(const DDim& dims);
 
   /*! The internal of two tensors share the same memory block. */
-  template <typename T>
   inline Tensor& ShareDataWith(const Tensor& src);
 
   /**
@@ -96,26 +99,9 @@ class Tensor {
   // TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647
   // Remove `CopyFrom` and `CopyFromVector` from Tensor interface
   // and make them global functions
-  template <typename T>
   inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
                        const platform::DeviceContext& ctx);
 
-  // FIXME(yuyang18): CopyFrom should without template T, use the replace
-  // `CopyFrom` with `CopyFromTensor`
-  inline void CopyFromTensor(const Tensor& src,
-                             const platform::Place& dst_place,
-                             const platform::DeviceContext& ctx) {
-    // NOLINTNEXTLINES_8 cpplint.py will recognize below lines as functions.
-    // That is a bug of cpplint.py. Just ignore lint these lines.
-    if (src.type() == std::type_index(typeid(double))) {
-      CopyFrom<double>(src, dst_place, ctx);
-    } else if (src.type() == std::type_index(typeid(float))) {
-      CopyFrom<float>(src, dst_place, ctx);
-    } else if (src.type() == std::type_index(typeid(int))) {
-      CopyFrom<int>(src, dst_place, ctx);
-    }
-  }
-
   /**
    * @brief   Copy the content of an external vector to a tensor.
    *
@@ -135,7 +121,6 @@ class Tensor {
    * @param[in] begin_idx   The begin index of the slice.
    * @param[in] end_idx     The end index of the slice.
    */
-  template <typename T>
   inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
 
   platform::Place place() const {
@@ -146,7 +131,6 @@ class Tensor {
   std::type_index type() const { return holder_->type(); }
 
  private:
-  template <typename T>
   inline void check_memory_size() const;
 
  private:
@@ -155,20 +139,22 @@ class Tensor {
    * parameter of Variable.
    */
   struct Placeholder {
-    virtual ~Placeholder() {}
+    virtual ~Placeholder() = default;
     virtual void* ptr() const = 0;
     virtual size_t size() const = 0;
     virtual std::type_index type() const = 0;
     virtual platform::Place place() const = 0;
+    virtual void set_type(std::type_index type) = 0;
   };
 
-  template <typename T, typename Place>
+  template <typename Place>
   struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(Place place, size_t size)
-        : ptr_(static_cast<T*>(memory::Alloc(place, size)),
-               memory::PODDeleter<T, Place>(place)),
+    PlaceholderImpl(Place place, size_t size, std::type_index type)
+        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
+               memory::PODDeleter<uint8_t, Place>(place)),
           place_(place),
-          size_(size) {
+          size_(size),
+          type_(type) {
       PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
                               (is_cpu_place(place_) ? "CPU" : "GPU"));
     }
@@ -176,16 +162,20 @@ class Tensor {
     virtual size_t size() const { return size_; }
     virtual platform::Place place() const { return place_; }
     virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
-    virtual std::type_index type() const { return std::type_index(typeid(T)); }
+    virtual std::type_index type() const { return type_; }
+    virtual void set_type(std::type_index type) { type_ = type; }
 
     /*! the pointer of memory block. */
-    std::unique_ptr<T, memory::PODDeleter<T, Place>> ptr_;
+    std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
 
     /*! the place of memory block. */
     platform::Place place_;
 
    /*! the size of memory block. */
     size_t size_;
+
+    /* the current type of memory */
+    std::type_index type_;
   };
 
   /*! holds the memory block if allocated. */
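The placeholder is now type-erased: it stores raw bytes plus a std::type_index instead of being templated on the element type. A standalone sketch of that idea (Holder and TypedData are illustrative names, not the Paddle classes):

// Untyped storage plus a remembered std::type_index; typed access checks the
// stored type, mirroring the PADDLE_ENFORCE added to Tensor::data<T>().
#include <cassert>
#include <cstdint>
#include <typeindex>
#include <typeinfo>
#include <vector>

class Holder {
 public:
  Holder(size_t bytes, std::type_index type) : buf_(bytes), type_(type) {}

  void* ptr() { return buf_.data(); }
  size_t size() const { return buf_.size(); }
  std::type_index type() const { return type_; }
  void set_type(std::type_index type) { type_ = type; }

 private:
  std::vector<uint8_t> buf_;   // raw bytes, like the uint8_t unique_ptr above
  std::type_index type_;       // the current element type
};

template <typename T>
T* TypedData(Holder& h) {
  assert(h.type() == std::type_index(typeid(T)) && "holder stores another type");
  return reinterpret_cast<T*>(h.ptr());
}

int main() {
  Holder h(4 * sizeof(float), typeid(float));
  TypedData<float>(h)[0] = 1.5f;             // ok: types match
  // TypedData<int>(h);                      // would trip the assertion
  assert(TypedData<float>(h)[0] == 1.5f);
}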

@@ -106,8 +106,8 @@ void TensorArray::Write(size_t index, const LoDTensor& value) {
 
   values_[index].Resize(value.dims());
   values_[index].mutable_data<value_type>(platform::CPUPlace());
-  values_[index].CopyFrom<value_type>(value, platform::CPUPlace(),
-                                      platform::CPUDeviceContext());
+  values_[index].CopyFrom(value, platform::CPUPlace(),
+                          platform::CPUDeviceContext());
 }
 
 void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
@@ -116,7 +116,7 @@ void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
     values_.resize(index + 1);
   }
 
-  values_[index].ShareDataWith<value_type>(value);
+  values_[index].ShareDataWith(value);
 }
 
 LoDTensor TensorArray::Pack(size_t level, const std::vector<DySeqMeta>& meta,
@@ -163,9 +163,9 @@ LoDTensor TensorArray::Stack() const {
   result.mutable_data<value_type>(platform::CPUPlace());
 
   for (size_t idx = 0; idx < size(); idx++) {
-    result.Slice<value_type>(idx, idx + 1)
-        .CopyFrom<value_type>(Read(idx), platform::CPUPlace(),
-                              platform::CPUDeviceContext());
+    result.Slice(idx, idx + 1)
+        .CopyFrom(Read(idx), platform::CPUPlace(),
+                  platform::CPUDeviceContext());
   }
   return result;
 }
@@ -191,13 +191,12 @@ void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const {
     auto& value = values_[elem];
     if (data_shared) {
       // share memory
-      value.ShareDataWith<value_type>(source.Slice<value_type>(elem, elem + 1));
+      value.ShareDataWith(source.Slice(elem, elem + 1));
     } else {
       // copy
       value.Resize(value_dims);
-      value.CopyFrom<value_type>(source.Slice<value_type>(elem, elem + 1),
-                                 platform::CPUPlace(),
-                                 platform::CPUDeviceContext());
+      value.CopyFrom(source.Slice(elem, elem + 1), platform::CPUPlace(),
+                     platform::CPUDeviceContext());
     }
   }
 }
@@ -242,11 +241,10 @@ LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
 
   for (size_t i = 0; i < indice.size(); i++) {
     auto index = indice[i];
-    auto target = result.Slice<value_type>(i, i + 1);
-    auto slice = source->Slice<value_type>(index, index + 1);
+    auto target = result.Slice(i, i + 1);
+    auto slice = source->Slice(index, index + 1);
 
-    target.CopyFrom<value_type>(slice, platform::CPUPlace(),
-                                platform::CPUDeviceContext());
+    target.CopyFrom(slice, platform::CPUPlace(), platform::CPUDeviceContext());
   }
 
   return result;
@@ -277,10 +275,10 @@ LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
       // target is result[index]
       auto index = seq_meta.begin + batch_id;
      if (index >= seq_meta.end) break;
-      auto source_ = source[batch_id].Slice<float>(seq_id, seq_id + 1);
-      auto target = result.Slice<float>(index, index + 1);
-      target.CopyFrom<float>(source_, platform::CPUPlace(),
-                             platform::CPUDeviceContext());
+      auto source_ = source[batch_id].Slice(seq_id, seq_id + 1);
+      auto target = result.Slice(index, index + 1);
+      target.CopyFrom(source_, platform::CPUPlace(),
+                      platform::CPUDeviceContext());
     }
   }

@@ -91,7 +91,7 @@ class TensorArrayPackTester : public ::testing::Test {
       size_t begin = level[i];
       size_t end = level[i + 1];
       for (size_t j = begin; j < end; j++) {
-        auto record = source.Slice<int>(j, j + 1);
+        auto record = source.Slice(j, j + 1);
         for (int dim = 0; dim < 128; dim++) {
           record.mutable_data<int>(platform::CPUPlace())[dim] = j - begin;
         }

@@ -19,12 +19,50 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+template <typename... T>
+struct SizeOfTypeFunctor;
+
 template <typename T>
+struct SizeOfTypeFunctor<T> {
+  size_t operator()(std::type_index type) const {
+    if (typeid(T).hash_code() == type.hash_code()) {
+      return sizeof(T);
+    } else {
+      return 0UL;
+    }
+  }
+};
+
+template <>
+struct SizeOfTypeFunctor<> {
+  size_t operator()(std::type_index type) const { return 0UL; }
+};
+
+template <typename HEAD, typename... TAIL>
+struct SizeOfTypeFunctor<HEAD, TAIL...> {
+  size_t operator()(std::type_index type) const {
+    SizeOfTypeFunctor<HEAD> head;
+    size_t head_size = head(type);
+    if (head_size != 0) {
+      return head_size;
+    }
+    SizeOfTypeFunctor<TAIL...> tail;
+    return tail(type);
+  }
+};
+
+static inline size_t SizeOfType(std::type_index type) {
+  SizeOfTypeFunctor<int, float, double, int16_t, int64_t> functor;
+  size_t size = functor(type);
+  PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
+  return size;
+}
+
 inline void Tensor::check_memory_size() const {
   PADDLE_ENFORCE_NOT_NULL(
       holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
   PADDLE_ENFORCE_GE(
-      holder_->size(), numel() * sizeof(T) + offset_,
+      holder_->size(), numel() * SizeOfType(type()) + offset_,
       "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
      "first to re-allocate memory.\n"
      "or maybe the required data-type mismatches the data already stored.");
@@ -32,14 +70,23 @@ inline void Tensor::check_memory_size() const {
 
 template <typename T>
 inline const T* Tensor::data() const {
-  check_memory_size<T>();
+  check_memory_size();
+  PADDLE_ENFORCE(std::is_same<T, void>::value ||
+                     holder_->type().hash_code() == typeid(T).hash_code(),
+                 "Tensor holds the wrong type, it holds %s",
+                 this->holder_->type().name());
+
   return reinterpret_cast<const T*>(
       reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
 }
 
 template <typename T>
 inline T* Tensor::data() {
-  check_memory_size<T>();
+  check_memory_size();
+  PADDLE_ENFORCE(std::is_same<T, void>::value ||
+                     holder_->type().hash_code() == typeid(T).hash_code(),
+                 "Tensor holds the wrong type, it holds %s",
+                 this->holder_->type().name());
   return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                               offset_);
 }
@@ -54,51 +101,62 @@ inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
 
 template <typename T>
 inline T* Tensor::mutable_data(platform::Place place) {
   static_assert(std::is_pod<T>::value, "T must be POD");
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
+}
+
+inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+  if (holder_ != nullptr) {
+    holder_->set_type(type);
+  }
   PADDLE_ENFORCE_GT(numel(), 0,
                     "Tensor's numel must be larger than zero to call "
                     "Tensor::mutable_data. Call Tensor::set_dim first.");
+  int64_t size = numel() * SizeOfType(type);
   /* some versions of boost::variant don't have operator!= */
-  int64_t size = numel() * sizeof(T);
   if (holder_ == nullptr || !(holder_->place() == place) ||
       holder_->size() < size + offset_) {
     if (platform::is_cpu_place(place)) {
-      holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size));
+      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
+          boost::get<platform::CPUPlace>(place), size, type));
     } else if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
       PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
     }
 #else
-      holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
-          boost::get<platform::GPUPlace>(place), size));
+      holder_.reset(new PlaceholderImpl<platform::GPUPlace>(
+          boost::get<platform::GPUPlace>(place), size, type));
     }
 #endif
     offset_ = 0;
   }
-  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                              offset_);
+  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                                 offset_);
+}
+
+inline void* Tensor::mutable_data(platform::Place place) {
+  PADDLE_ENFORCE(this->holder_ != nullptr,
+                 "Cannot invoke mutable data if current hold nothing");
+  return mutable_data(place, holder_->type());
 }
 
-template <typename T>
 inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
-  src.check_memory_size<T>();
+  src.check_memory_size();
   *this = src;
   return *this;
 }
 
-template <typename T>
 inline void Tensor::CopyFrom(const Tensor& src,
                              const platform::Place& dst_place,
                              const platform::DeviceContext& ctx) {
-  src.check_memory_size<T>();
+  src.check_memory_size();
   Resize(src.dims());
 
   auto src_place = src.holder_->place();
-  auto src_ptr = static_cast<const void*>(src.data<T>());
+  auto src_ptr = src.data<void>();
 
-  auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
+  auto dst_ptr = mutable_data(dst_place, src.type());
 
-  auto size = src.numel() * sizeof(T);
+  auto size = src.numel() * SizeOfType(src.type());
 
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
@@ -165,9 +223,8 @@ inline void Tensor::CopyFromVector(const std::vector<T>& src,
 #endif
 }
 
-template <typename T>
 inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
-  check_memory_size<T>();
+  check_memory_size();
   PADDLE_ENFORCE_GE(begin_idx, 0, "Slice begin index is less than zero.");
   PADDLE_ENFORCE_LE(end_idx, dims_[0], "Slice end index is out of bound.");
   PADDLE_ENFORCE_LT(begin_idx, end_idx,
@@ -182,7 +239,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
     DDim dst_dims = dims_;
     dst_dims[0] = end_idx - begin_idx;
     dst.Resize(dst_dims);
-    dst.offset_ = offset_ + begin_idx * base * sizeof(T);
+    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
     return dst;
   }
 }
@@ -196,10 +253,9 @@ inline const DDim& Tensor::dims() const { return dims_; }
 
 inline int64_t Tensor::numel() const { return product(dims_); }
 
-template <typename T>
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
   Tensor res;
-  res.ShareDataWith<T>(src);
+  res.ShareDataWith(src);
   res.Resize(flatten_to_2d(src.dims(), num_col_dims));
   return res;
 }
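The hunks above replace sizeof(T) with a runtime lookup through SizeOfType. A condensed standalone sketch of how that variadic functor resolves an element size from a std::type_index (the single-type specialization is folded into the recursive case, and a plain throw stands in for PADDLE_ENFORCE, so this is not the Paddle header verbatim):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <typeindex>
#include <typeinfo>

template <typename... T>
struct SizeOfTypeFunctor;

template <>
struct SizeOfTypeFunctor<> {
  size_t operator()(std::type_index) const { return 0UL; }  // no match found
};

template <typename HEAD, typename... TAIL>
struct SizeOfTypeFunctor<HEAD, TAIL...> {
  size_t operator()(std::type_index type) const {
    if (typeid(HEAD).hash_code() == type.hash_code()) return sizeof(HEAD);
    return SizeOfTypeFunctor<TAIL...>()(type);  // fall through to the tail
  }
};

inline size_t SizeOfType(std::type_index type) {
  size_t size = SizeOfTypeFunctor<int, float, double, int16_t, int64_t>()(type);
  if (size == 0UL) throw std::runtime_error("Cannot get size of type");
  return size;
}

int main() {
  std::cout << SizeOfType(typeid(int64_t)) << "\n";  // 8 on common platforms
  std::cout << SizeOfType(typeid(float)) << "\n";    // 4
}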

@ -108,7 +108,7 @@ TEST(Tensor, ShareDataWith) {
// Try to share data form uninitialized tensor // Try to share data form uninitialized tensor
bool caught = false; bool caught = false;
try { try {
dst_tensor.ShareDataWith<float>(src_tensor); dst_tensor.ShareDataWith(src_tensor);
} catch (paddle::platform::EnforceNotMet err) { } catch (paddle::platform::EnforceNotMet err) {
caught = true; caught = true;
std::string msg = std::string msg =
@ -122,7 +122,7 @@ TEST(Tensor, ShareDataWith) {
ASSERT_TRUE(caught); ASSERT_TRUE(caught);
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace()); src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
dst_tensor.ShareDataWith<int>(src_tensor); dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>()); ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
} }
@ -131,7 +131,7 @@ TEST(Tensor, ShareDataWith) {
Tensor src_tensor; Tensor src_tensor;
Tensor dst_tensor; Tensor dst_tensor;
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace()); src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
dst_tensor.ShareDataWith<int>(src_tensor); dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>()); ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
} }
#endif #endif
@ -143,7 +143,7 @@ TEST(Tensor, Slice) {
{ {
Tensor src_tensor; Tensor src_tensor;
src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace()); src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
Tensor slice_tensor = src_tensor.Slice<int>(1, 3); Tensor slice_tensor = src_tensor.Slice(1, 3);
DDim slice_dims = slice_tensor.dims(); DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 3); ASSERT_EQ(arity(slice_dims), 3);
EXPECT_EQ(slice_dims[0], 2); EXPECT_EQ(slice_dims[0], 2);
@ -167,7 +167,7 @@ TEST(Tensor, Slice) {
{ {
Tensor src_tensor; Tensor src_tensor;
src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace()); src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
Tensor slice_tensor = src_tensor.Slice<double>(2, 6); Tensor slice_tensor = src_tensor.Slice(2, 6);
DDim slice_dims = slice_tensor.dims(); DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2); ASSERT_EQ(arity(slice_dims), 2);
EXPECT_EQ(slice_dims[0], 4); EXPECT_EQ(slice_dims[0], 4);
@ -202,7 +202,7 @@ TEST(Tensor, CopyFrom) {
memcpy(src_ptr, arr, 9 * sizeof(int)); memcpy(src_ptr, arr, 9 * sizeof(int));
auto cpu_place = new paddle::platform::CPUPlace(); auto cpu_place = new paddle::platform::CPUPlace();
dst_tensor.CopyFrom<int>(src_tensor, *cpu_place, cpu_ctx); dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx);
const int* dst_ptr = dst_tensor.data<int>(); const int* dst_ptr = dst_tensor.data<int>();
ASSERT_NE(src_ptr, dst_ptr); ASSERT_NE(src_ptr, dst_ptr);
@ -210,8 +210,8 @@ TEST(Tensor, CopyFrom) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]); EXPECT_EQ(src_ptr[i], dst_ptr[i]);
} }
Tensor slice_tensor = src_tensor.Slice<int>(1, 2); Tensor slice_tensor = src_tensor.Slice(1, 2);
dst_tensor.CopyFrom<int>(slice_tensor, *cpu_place, cpu_ctx); dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx);
const int* slice_ptr = slice_tensor.data<int>(); const int* slice_ptr = slice_tensor.data<int>();
dst_ptr = dst_tensor.data<int>(); dst_ptr = dst_tensor.data<int>();
ASSERT_NE(dst_ptr, slice_ptr); ASSERT_NE(dst_ptr, slice_ptr);
@ -233,11 +233,11 @@ TEST(Tensor, CopyFrom) {
// CPU Tensor to GPU Tensor // CPU Tensor to GPU Tensor
auto gpu_place = new paddle::platform::GPUPlace(0); auto gpu_place = new paddle::platform::GPUPlace(0);
CUDADeviceContext gpu_ctx(*gpu_place); CUDADeviceContext gpu_ctx(*gpu_place);
gpu_tensor.CopyFrom<int>(src_tensor, *gpu_place, gpu_ctx); gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx);
// GPU Tensor to CPU Tensor // GPU Tensor to CPU Tensor
auto cpu_place = new paddle::platform::CPUPlace(); auto cpu_place = new paddle::platform::CPUPlace();
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx); dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Tensors // Sync before Compare Tensors
gpu_ctx.Wait(); gpu_ctx.Wait();
@ -247,13 +247,13 @@ TEST(Tensor, CopyFrom) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]); EXPECT_EQ(src_ptr[i], dst_ptr[i]);
} }
Tensor slice_tensor = src_tensor.Slice<int>(1, 2); Tensor slice_tensor = src_tensor.Slice(1, 2);
// CPU Slice Tensor to GPU Tensor // CPU Slice Tensor to GPU Tensor
gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_place, gpu_ctx); gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx);
// GPU Tensor to CPU Tensor // GPU Tensor to CPU Tensor
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx); dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Slice Tensors // Sync before Compare Slice Tensors
gpu_ctx.Wait(); gpu_ctx.Wait();
@ -320,7 +320,7 @@ TEST(Tensor, CopyFromVector) {
CUDADeviceContext gpu_ctx(*gpu_place); CUDADeviceContext gpu_ctx(*gpu_place);
gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx); gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
// Copy from GPU to CPU tensor for comparison // Copy from GPU to CPU tensor for comparison
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx); dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Tensors // Sync before Compare Tensors
gpu_ctx.Wait(); gpu_ctx.Wait();
@ -340,7 +340,7 @@ TEST(Tensor, CopyFromVector) {
cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx); cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
gpu_tensor.Resize(make_ddim({2, 2})); gpu_tensor.Resize(make_ddim({2, 2}));
gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx); gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx); dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Tensors // Sync before Compare Tensors
gpu_ctx.Wait(); gpu_ctx.Wait();
@ -368,7 +368,7 @@ TEST(Tensor, ReshapeToMatrix) {
for (int i = 0; i < 2 * 3 * 4 * 9; ++i) { for (int i = 0; i < 2 * 3 * 4 * 9; ++i) {
src_ptr[i] = i; src_ptr[i] = i;
} }
Tensor res = ReshapeToMatrix<int>(src, 2); Tensor res = ReshapeToMatrix(src, 2);
ASSERT_EQ(res.dims()[0], 2 * 3); ASSERT_EQ(res.dims()[0], 2 * 3);
ASSERT_EQ(res.dims()[1], 4 * 9); ASSERT_EQ(res.dims()[1], 4 * 9);
} }

@@ -0,0 +1,58 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=16)
+channels = get_config_arg("channels", int, 2)
+
+
+def two_fc(input, group_name):
+    out1 = fc_layer(input=input,
+                    name=group_name+'_fc1',
+                    size=channels,
+                    bias_attr=False,
+                    act=LinearActivation())
+
+    out2 = fc_layer(input=input,
+                    name=group_name+'_fc2',
+                    size=channels,
+                    bias_attr=False,
+                    act=LinearActivation())
+    return out1, out2
+
+data = data_layer(name ="input", size=channels*16*16)
+
+conv = img_conv_layer(input=data,
+                      num_channels=channels,
+                      filter_size=3,
+                      num_filters=channels,
+                      padding=1,
+                      shared_biases=True,
+                      act=LinearActivation())
+
+pool = img_pool_layer(input=conv,
+                      pool_size=3,
+                      stride=2,
+                      padding=1,
+                      pool_type=AvgPooling())
+
+a1, a2 = two_fc(input=pool, group_name='a')
+concat = concat_layer(input=[a1, a2])
+
+b1, b2 = two_fc(input=pool, group_name='b')
+addto = addto_layer(input=[b1, b2])
+
+outputs([concat, addto])

@@ -0,0 +1,60 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=16)
+channels = get_config_arg("channels", int, 2)
+
+
+def two_pool(input, group_name):
+    out1 = img_pool_layer(input=input,
+                          name=group_name+'_pool1',
+                          pool_size=3,
+                          stride=2,
+                          padding=0,
+                          pool_type=MaxPooling())
+
+    out2 = img_pool_layer(input=input,
+                          name=group_name+'_pool2',
+                          pool_size=5,
+                          stride=2,
+                          padding=1,
+                          pool_type=MaxPooling())
+    return out1, out2
+
+data = data_layer(name ="input", size=channels*16*16)
+
+conv = img_conv_layer(input=data,
+                      num_channels=channels,
+                      filter_size=3,
+                      num_filters=channels,
+                      padding=1,
+                      shared_biases=True,
+                      act=LinearActivation())
+
+pool = img_pool_layer(input=conv,
+                      pool_size=3,
+                      stride=1,
+                      padding=1,
+                      pool_type=AvgPooling())
+
+a1, a2 = two_pool(input=pool, group_name='a')
+concat = concat_layer(input=[a1, a2])
+
+b1, b2 = two_pool(input=pool, group_name='b')
+addto = addto_layer(input=[b1, b2])
+
+outputs([concat, addto])

@@ -250,7 +250,7 @@ TEST(MKLDNNActivation, Activations) {
 DECLARE_string(config_args);
 TEST(MKLDNNLayer, branches) {
-  std::vector<std::string> cases = {"conv"};
+  std::vector<std::string> cases = {"conv", "pool", "fc"};
   for (auto name : cases) {
     std::string config = "./gserver/tests/mkldnn_branches_" + name + ".conf";
     for (auto channels : {2, 32}) {

@@ -69,5 +69,8 @@ information, or not. But the output only shares the LoD with input `Inference`.
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker);
-REGISTER_OP_CPU_KERNEL(accuracy,
-                       ops::AccuracyKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    accuracy, ops::AccuracyKernel<paddle::platform::CPUPlace, float>,
+    ops::AccuracyKernel<paddle::platform::CPUPlace, int>,
+    ops::AccuracyKernel<paddle::platform::CPUPlace, double>,
+    ops::AccuracyKernel<paddle::platform::CPUPlace, int64_t>);

@@ -21,9 +21,9 @@ namespace paddle {
 namespace operators {
 using platform::PADDLE_CUDA_NUM_THREADS;
 
-template <int BlockSize>
-__global__ void AccuracyCudaKernel(const int N, const int D, const int* Xdata,
-                                   const int* labeldata, float* accuracy) {
+template <typename T, int BlockSize>
+__global__ void AccuracyCudaKernel(const int N, const int D, const T* Xdata,
+                                   const T* labeldata, float* accuracy) {
   int count = 0;
   __shared__ int total[BlockSize];
 
@@ -57,8 +57,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
     auto* accuracy = ctx.Output<Tensor>("Accuracy");
     // FIXME(typhoonzero): only support indices currently
     // if add support for output values, how to detect the data type?
-    const int* inference_data = inference->data<int>();
-    const int* label_data = label->data<int>();
+    const T* inference_data = inference->data<T>();
+    const T* label_data = label->data<T>();
     float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
 
     size_t num_samples = inference->dims()[0];
@@ -69,7 +69,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
       return;
     }
 
-    AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
+    AccuracyCudaKernel<T, PADDLE_CUDA_NUM_THREADS><<<
         1, PADDLE_CUDA_NUM_THREADS, 0,
         reinterpret_cast<const platform::CUDADeviceContext&>(
            ctx.device_context())
@@ -81,5 +81,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_GPU_KERNEL(accuracy,
-                       paddle::operators::AccuracyOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
+                       paddle::operators::AccuracyOpCUDAKernel<double>,
+                       paddle::operators::AccuracyOpCUDAKernel<int>,
+                       paddle::operators::AccuracyOpCUDAKernel<int64_t>);

@@ -108,17 +108,17 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
     int in_step = input_channels / groups;
     int out_step = output_channels / groups;
     for (int i = 0; i < batch_size; i++) {
-      Tensor in_batch = input->Slice<T>(i, i + 1).Resize(input_shape);
-      Tensor out_batch = output->Slice<T>(i, i + 1).Resize(output_matrix_shape);
+      Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+      Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
       for (int g = 0; g < groups; g++) {
         // im2col
-        Tensor in_slice = in_batch.Slice<T>(g * in_step, (g + 1) * in_step);
+        Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
         im2col(context.device_context(), in_slice, col, strides[0], strides[1],
                paddings[0], paddings[1]);
 
         // gemm
-        Tensor out_slice = out_batch.Slice<T>(g * out_step, (g + 1) * out_step);
-        Tensor filter_slice = filter.Slice<T>(g * out_step, (g + 1) * out_step);
+        Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+        Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
         math::matmul<Place, T>(context.device_context(), filter_slice, false,
                                col_matrix, false, T(1.0), &out_slice, T(0.0));
       }
@@ -198,22 +198,20 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
       for (int i = 0; i < batch_size; i++) {
         Tensor out_grad_batch =
-            output_grad->Slice<T>(i, i + 1).Resize(output_matrix_shape);
-        Tensor in_grad_batch =
-            input_grad->Slice<T>(i, i + 1).Resize(input_shape);
+            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
+        Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape);
         for (int g = 0; g < groups; g++) {
           // gemm
           Tensor out_grad_slice =
-              out_grad_batch.Slice<T>(g * out_step, (g + 1) * out_step);
-          Tensor filter_slice =
-              filter.Slice<T>(g * out_step, (g + 1) * out_step);
+              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
+          Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
           math::matmul<Place, T>(context.device_context(), filter_slice, true,
                                  out_grad_slice, false, T(1.0), &col_matrix,
                                  T(0.0));
 
           // col2im
           Tensor in_grad_slice =
-              in_grad_batch.Slice<T>(g * in_step, (g + 1) * in_step);
+              in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
           col2im(context.device_context(), in_grad_slice, col, strides[0],
                  strides[1], paddings[0], paddings[1]);
         }
@@ -229,19 +227,19 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
       for (int i = 0; i < batch_size; i++) {
         Tensor out_grad_batch =
-            output_grad->Slice<T>(i, i + 1).Resize(output_matrix_shape);
-        Tensor in_batch = input->Slice<T>(i, i + 1).Resize(input_shape);
+            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
+        Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
         for (int g = 0; g < groups; g++) {
           // im2col
           Tensor out_grad_slice =
-              out_grad_batch.Slice<T>(g * out_step, (g + 1) * out_step);
-          Tensor in_slice = in_batch.Slice<T>(g * in_step, (g + 1) * in_step);
+              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
+          Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
           im2col(context.device_context(), in_slice, col, strides[0],
                  strides[1], paddings[0], paddings[1]);
 
           // gemm
           Tensor filter_grad_slice =
-              filter_grad_.Slice<T>(g * out_step, (g + 1) * out_step);
+              filter_grad_.Slice(g * out_step, (g + 1) * out_step);
           math::matmul<Place, T>(context.device_context(), out_grad_slice,
                                  false, col_matrix, true, T(1.0),
                                  &filter_grad_slice, T(1.0));

File diff suppressed because it is too large.

Some files were not shown because too many files have changed in this diff.
