Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into develop

revert-4814-Add_sequence_project_op
zchen0211 7 years ago
commit 502e72591f

@ -189,7 +189,7 @@ OpDesc {
inputs = {0} // the index of x in vars of BlockDesc above
outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above
attrs {
"memories" : {1} // the index of h
"states" : {1} // the index of h
"step_net" : <above step net>
}
};

@ -28,23 +28,37 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
add_dependencies(paddle_capi paddle_proto)
# combine all paddle static libraries together, into libpaddle_capi_whole.a
# user should use PaddleCAPI as -lpaddle_capi_whole
set(PADDLE_CAPI_INFER_LIBS
paddle_utils
paddle_parameter
paddle_math
paddle_cuda
paddle_function
paddle_gserver
paddle_proto)
# TODO: paddle_capi_whole will be removed.
if(MOBILE_INFERENCE)
set(PADDLE_CAPI_INFER_LIBS
paddle_utils
paddle_parameter
paddle_math
paddle_cuda
paddle_function
paddle_gserver
paddle_proto)
else()
set(PADDLE_CAPI_INFER_LIBS
paddle_utils
paddle_parameter
paddle_math
paddle_cuda
paddle_function
paddle_gserver
paddle_proto
paddle_pserver
paddle_network)
endif()
cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
# No shared library for iOS
# Link the static library for inference
cc_library(paddle_capi_engine DEPS paddle_capi paddle_utils paddle_parameter paddle_math paddle_cuda paddle_proto)
cc_library(paddle_capi_layers DEPS paddle_function paddle_gserver)
# Link the shared library for inference
if(NOT IOS)
set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map")
# TODO: merge mkl into paddle_capi_shared
set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_capi.map")
add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
set_target_properties(paddle_capi_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
@ -53,9 +67,10 @@ endif()
# install library & headers.
install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
install(FILES paddle_capi.map DESTINATION include/paddle)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
if(ANDROID)
install(TARGETS paddle_capi_whole paddle_capi_shared
install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers paddle_capi_shared
ARCHIVE DESTINATION lib/${ANDROID_ABI}
LIBRARY DESTINATION lib/${ANDROID_ABI})
execute_process(
@ -80,7 +95,7 @@ if(ANDROID)
)"
)
else(ANDROID)
install(TARGETS paddle_capi_whole ARCHIVE DESTINATION lib)
install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers ARCHIVE DESTINATION lib)
if(NOT IOS)
install(TARGETS paddle_capi_shared DESTINATION lib)
endif()

@ -21,6 +21,7 @@
#include "paddle/framework/block_desc.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/dynamic_recurrent_op.h"
#include "paddle/operators/net_op.h"
#include "paddle/operators/recurrent_op.h"
@ -220,8 +221,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
// process recurrent gradient op as a special operator.
if (forwardOp.Type() == "recurrent") {
// NOTE clean up cycle call somewhere (RNN's stepnet constains itself),
// or
// this will result in infinite loop.
// or this will result in infinite loop.
const auto& rnnop =
*static_cast<const operators::RecurrentOp*>(&forwardOp);
auto rnn_grad_op =
@ -231,6 +231,18 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
// create stepnet's gradient op
rnn_grad_op->set_stepnet(
BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
} else if (forwardOp.Type() == "dynamic_recurrent") {
// NOTE clean up cycle call somewhere (RNN's stepnet constains itself),
// or this will result in infinite loop.
const auto& rnnop =
*static_cast<const operators::DynamicRecurrentOp*>(&forwardOp);
auto rnn_grad_op =
static_cast<operators::DynamicRecurrentGradientOp*>(grad_op.get());
const auto& stepnet_op =
*static_cast<const OperatorBase*>(&rnnop.rnn.GetStepUnit());
// create stepnet's gradient op
rnn_grad_op->rnn.SetStepUnit(
BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
}
if (net->ops_.empty()) { // Current no aux op is added to network

@ -26,6 +26,8 @@ inline DataType ToDataType(std::type_index type) {
return DataType::FP64;
} else if (typeid(int).hash_code() == type.hash_code()) {
return DataType::INT32;
} else if (typeid(int64_t).hash_code() == type.hash_code()) {
return DataType::INT64;
} else {
PADDLE_THROW("Not supported");
}

@ -84,8 +84,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
op->Run(local_scope, *device);
}
// TODO(tonyyang-svail):
// - Destroy local_scope
scope->DeleteScope(&local_scope);
}
} // namespace framework

@ -21,28 +21,28 @@ limitations under the License. */
namespace paddle {
namespace framework {
template <typename T>
void SetFeedVariable(const LoDTensor& input, const std::string& var_name,
size_t index) {
void SetFeedVariable(Scope* scope, const LoDTensor& input,
const std::string& var_name, size_t index) {
// If var_name Variable is not found in GlobalScope, a new variable will
// be created.
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
Variable* g_feed_value = GetGlobalScope().Var(var_name);
Variable* g_feed_value = scope->Var(var_name);
auto& feed_inputs =
*(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
if (index >= feed_inputs.size()) {
feed_inputs.resize(index + 1);
}
// shared data with input tensor
feed_inputs[index].ShareDataWith<T>(input);
feed_inputs[index].ShareDataWith(input);
// set lod
feed_inputs[index].set_lod(input.lod());
}
LoDTensor& GetFetchVariable(const std::string& var_name, size_t index) {
LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
size_t index) {
// Since we want to fetch LodTensor from a variable, the variable must
// be created alreadly.
Variable* g_fetch_value = GetGlobalScope().FindVar(var_name);
Variable* g_fetch_value = scope.FindVar(var_name);
PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
"Only %s can be invoked by GetFetchVariable",
typeid(FeedFetchList).name());

@ -25,31 +25,50 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
for (size_t i = level_begin; i < level_end; i++) {
new_lod.emplace_back(in.at(i));
}
// transform the lowest level to absolute offset.
LoD abs_offset_lod = ToAbsOffset(in);
new_lod.back() = abs_offset_lod[level_end - 1];
return new_lod;
}
LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
size_t elem_end) {
// slice the lod.
LoD new_lod;
new_lod.reserve(in.size() - level);
auto start = in.at(level)[elem_begin];
auto end = in.at(level)[elem_end];
for (auto it = in.begin() + level; it != in.end(); it++) {
auto it_begin = std::find(it->begin(), it->end(), start);
auto it_end = std::find(it_begin, it->end(), end);
PADDLE_ENFORCE(it_begin != it->end(), "error in parsing lod info");
PADDLE_ENFORCE(it_end != it->end(), "error in parsing lod info");
new_lod.emplace_back(it_begin, it_end + 1);
// reset offset if tensor is copyed and sliced.
std::transform(new_lod.back().begin(), new_lod.back().end(),
new_lod.back().begin(),
[start](int v) { return v - start; });
PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LoD");
PADDLE_ENFORCE_LT(level, in.size());
PADDLE_ENFORCE_LT(elem_end, in[level].size());
LoD res;
res.resize(in.size() - level);
// copy the first level
res[0].assign(in[level].begin() + elem_begin,
in[level].begin() + elem_end + 1);
for (size_t lvl = 1; lvl < res.size(); lvl++) {
const auto& in_level = in[level + lvl];
const auto& above_level = res[lvl - 1];
auto& out_level = res[lvl];
out_level.assign(in_level.begin() + above_level.front(),
in_level.begin() + above_level.back() + 1);
}
PADDLE_ENFORCE_LE(new_lod.size(), in.size());
return new_lod;
for (size_t lvl = 0; lvl < res.size(); lvl++) {
// to make the first offset equals 0, all the elements minus the first
// element
size_t front = res[lvl].front();
for (auto& ele : res[lvl]) {
ele -= front;
}
}
return res;
}
LoD ToAbsOffset(const LoD& in) {
// the lowest level stores relative offsets
if (in.empty() || in.size() == 1) return in;
LoD result = in;
for (int level = result.size() - 2; level >= 0; level--) {
for (auto& ele : result[level]) {
ele = result[level + 1][ele];
}
}
return result;
}
bool operator==(const LoD& a, const LoD& b) {
@ -75,17 +94,7 @@ bool operator==(const LoD& a, const LoD& b) {
size_t LoDTensor::NumElements(size_t level, size_t idx) const {
PADDLE_ENFORCE_LT(level, NumLevels());
PADDLE_ENFORCE_LT(idx, NumElements(level));
// the last level of LoD, just return number of records in Tensor
if (level == NumLevels() - 1) {
return lod_[level][idx + 1] - lod_[level][idx];
}
// high level of LoD, and there is another lower level, return number of
// lower-level elements
auto tmp = SliceInLevel(lod_, level, idx, idx + 1);
PADDLE_ENFORCE_GE(tmp.size(), 2);
// there is a 0 as a placeholder stored in LoD, so the number of elements
// equals lod.size() - 1
return tmp[1].size() - 1;
return lod_[level][idx + 1] - lod_[level][idx];
}
void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) {

@ -39,23 +39,36 @@ using Vector = thrust::host_vector<
#endif
/*
* 3-level LoD stores
* LoD is short for Level of Details.
*
* 0 10 20
* 0 5 10 15 20
* 0 2 5 7 10 12 15 20
*
* - in a level, each element indicates offset in the underlying Tensor
* - in a level, each element indicates relative offset of the lower level
* - the first element should be 0 and that indicates that this sequence start
* from 0
* - each sequence's begin and end(no-inclusive) is level[id, id+1]
*
* For example:
* 3-level LoD stores
*
* 0 2 3
* 0 2 4 7
* 0 2 5 7 10 12 15 20
*/
using LoD = std::vector<Vector<size_t>>;
/*
* Slice levels from a LoD.
* NOTE the lowest level should always be the absolute offsets of the underlying
* tensor instances. So if higher layers are sliced without the lowest level,
* the lower level of the sliced LoD will be transformed to the absolute offset.
*/
LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end);
LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
size_t elem_end);
/*
* Transform an LoD from relative offsets to absolute offsets.
*/
LoD ToAbsOffset(const LoD& in);
bool operator==(const LoD& a, const LoD& b);

@ -30,8 +30,8 @@ class LoDTensorTester : public ::testing::Test {
// 0 5 10 15 20
// 0 2 5 7 10 12 15 20
LoD lod;
lod.push_back(std::vector<size_t>{0, 10, 20});
lod.push_back(std::vector<size_t>{0, 5, 10, 15, 20});
lod.push_back(std::vector<size_t>{0, 2, 3});
lod.push_back(std::vector<size_t>{0, 2, 5, 8});
lod.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});
ASSERT_EQ(lod.size(), 3UL);
@ -52,14 +52,14 @@ TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor_.NumLevels(), 3UL); }
TEST_F(LoDTensorTester, NumElements) {
ASSERT_EQ(lod_tensor_.NumElements(0), 2UL);
ASSERT_EQ(lod_tensor_.NumElements(1), 4UL);
ASSERT_EQ(lod_tensor_.NumElements(1), 3UL);
ASSERT_EQ(lod_tensor_.NumElements(2), 8UL);
}
TEST_F(LoDTensorTester, NumElements2) {
ASSERT_EQ(lod_tensor_.NumElements(0, 0), 2UL);
ASSERT_EQ(lod_tensor_.NumElements(0, 1), 2UL);
ASSERT_EQ(lod_tensor_.NumElements(1, 1), 2UL);
ASSERT_EQ(lod_tensor_.NumElements(0, 1), 1UL);
ASSERT_EQ(lod_tensor_.NumElements(1, 1), 3UL);
}
TEST_F(LoDTensorTester, ShrinkLevels) {
@ -68,17 +68,16 @@ TEST_F(LoDTensorTester, ShrinkLevels) {
LoDTensor new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkLevels(level, level + 1);
ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
}
// shrink 2 level
for (size_t level = 0; level < 2UL; ++level) {
LoDTensor new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkLevels(level, level + 2);
// the lowest level's last element should be the tensor's batch_size.
ASSERT_EQ(new_lod_tensor.lod().back().back(),
lod_tensor_.lod().back().back());
ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
ASSERT_EQ(new_lod_tensor.NumElements(1),
lod_tensor_.NumElements(level + 1));
ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
}
}
@ -86,19 +85,19 @@ TEST_F(LoDTensorTester, ShrinkLevels) {
TEST_F(LoDTensorTester, ShrinkInLevel) {
size_t level = 0;
LoDTensor new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkInLevel(level, 0, 2);
new_lod_tensor.ShrinkInLevel(level, 0, 1);
EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL);
EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL);
EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL);
EXPECT_EQ(new_lod_tensor.NumElements(2), 8UL);
EXPECT_EQ(new_lod_tensor.NumElements(0), 1UL);
EXPECT_EQ(new_lod_tensor.NumElements(1), 2UL);
EXPECT_EQ(new_lod_tensor.NumElements(2), 5UL);
ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
level = 1;
new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkInLevel(level, 0, 2);
new_lod_tensor.ShrinkInLevel(level, 1, 2);
ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL);
ASSERT_EQ(new_lod_tensor.NumElements(1), 3UL);
ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
}

@ -65,12 +65,11 @@ void Scope::DropKids() {
kids_.clear();
}
framework::Scope& GetGlobalScope() {
static framework::Scope* g_scope = nullptr;
if (g_scope == nullptr) {
g_scope = new framework::Scope();
}
return *g_scope;
void Scope::DeleteScope(Scope* scope) {
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it);
delete scope;
}
} // namespace framework

@ -59,6 +59,8 @@ class Scope {
/// Find the scope or an ancestor scope that contains the given variable.
const Scope* FindScope(const Variable* var) const;
void DeleteScope(Scope* scope);
/// Drop all kids scopes belonged to this scope.
void DropKids();
@ -72,8 +74,5 @@ class Scope {
DISABLE_COPY_AND_ASSIGN(Scope);
};
framework::Scope& GetGlobalScope();
} // namespace framework
} // namespace paddle

@ -60,6 +60,10 @@ class Tensor {
template <typename T>
inline T* mutable_data(platform::Place place);
inline void* mutable_data(platform::Place place, std::type_index type);
inline void* mutable_data(platform::Place place);
/**
* @brief Return a pointer to mutable memory block.
*
@ -81,7 +85,6 @@ class Tensor {
inline Tensor& Resize(const DDim& dims);
/*! The internal of two tensors share the same memory block. */
template <typename T>
inline Tensor& ShareDataWith(const Tensor& src);
/**
@ -96,26 +99,9 @@ class Tensor {
// TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647
// Remove `CopyFrom` and `CopyFromVector` from Tensor interface
// and make them global functions
template <typename T>
inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx);
// FIXME(yuyang18): CopyFrom should without template T, use the replace
// `CopyFrom` with `CopyFromTensor`
inline void CopyFromTensor(const Tensor& src,
const platform::Place& dst_place,
const platform::DeviceContext& ctx) {
// NOLINTNEXTLINES_8 cpplint.py will recognize below lines as functions.
// That is a bug of cpplint.py. Just ignore lint these lines.
if (src.type() == std::type_index(typeid(double))) {
CopyFrom<double>(src, dst_place, ctx);
} else if (src.type() == std::type_index(typeid(float))) {
CopyFrom<float>(src, dst_place, ctx);
} else if (src.type() == std::type_index(typeid(int))) {
CopyFrom<int>(src, dst_place, ctx);
}
}
/**
* @brief Copy the content of an external vector to a tensor.
*
@ -135,7 +121,6 @@ class Tensor {
* @param[in] begin_idx The begin index of the slice.
* @param[in] end_idx The end index of the slice.
*/
template <typename T>
inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
platform::Place place() const {
@ -146,7 +131,6 @@ class Tensor {
std::type_index type() const { return holder_->type(); }
private:
template <typename T>
inline void check_memory_size() const;
private:
@ -155,20 +139,22 @@ class Tensor {
* parameter of Variable.
*/
struct Placeholder {
virtual ~Placeholder() {}
virtual ~Placeholder() = default;
virtual void* ptr() const = 0;
virtual size_t size() const = 0;
virtual std::type_index type() const = 0;
virtual platform::Place place() const = 0;
virtual void set_type(std::type_index type) = 0;
};
template <typename T, typename Place>
template <typename Place>
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(Place place, size_t size)
: ptr_(static_cast<T*>(memory::Alloc(place, size)),
memory::PODDeleter<T, Place>(place)),
PlaceholderImpl(Place place, size_t size, std::type_index type)
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
memory::PODDeleter<uint8_t, Place>(place)),
place_(place),
size_(size) {
size_(size),
type_(type) {
PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
(is_cpu_place(place_) ? "CPU" : "GPU"));
}
@ -176,16 +162,20 @@ class Tensor {
virtual size_t size() const { return size_; }
virtual platform::Place place() const { return place_; }
virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
virtual std::type_index type() const { return std::type_index(typeid(T)); }
virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; }
/*! the pointer of memory block. */
std::unique_ptr<T, memory::PODDeleter<T, Place>> ptr_;
std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
/*! the place of memory block. */
platform::Place place_;
/*! the size of memory block. */
size_t size_;
/* the current type of memory */
std::type_index type_;
};
/*! holds the memory block if allocated. */

@ -106,8 +106,8 @@ void TensorArray::Write(size_t index, const LoDTensor& value) {
values_[index].Resize(value.dims());
values_[index].mutable_data<value_type>(platform::CPUPlace());
values_[index].CopyFrom<value_type>(value, platform::CPUPlace(),
platform::CPUDeviceContext());
values_[index].CopyFrom(value, platform::CPUPlace(),
platform::CPUDeviceContext());
}
void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
@ -116,7 +116,7 @@ void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
values_.resize(index + 1);
}
values_[index].ShareDataWith<value_type>(value);
values_[index].ShareDataWith(value);
}
LoDTensor TensorArray::Pack(size_t level, const std::vector<DySeqMeta>& meta,
@ -163,9 +163,9 @@ LoDTensor TensorArray::Stack() const {
result.mutable_data<value_type>(platform::CPUPlace());
for (size_t idx = 0; idx < size(); idx++) {
result.Slice<value_type>(idx, idx + 1)
.CopyFrom<value_type>(Read(idx), platform::CPUPlace(),
platform::CPUDeviceContext());
result.Slice(idx, idx + 1)
.CopyFrom(Read(idx), platform::CPUPlace(),
platform::CPUDeviceContext());
}
return result;
}
@ -191,13 +191,12 @@ void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const {
auto& value = values_[elem];
if (data_shared) {
// share memory
value.ShareDataWith<value_type>(source.Slice<value_type>(elem, elem + 1));
value.ShareDataWith(source.Slice(elem, elem + 1));
} else {
// copy
value.Resize(value_dims);
value.CopyFrom<value_type>(source.Slice<value_type>(elem, elem + 1),
platform::CPUPlace(),
platform::CPUDeviceContext());
value.CopyFrom(source.Slice(elem, elem + 1), platform::CPUPlace(),
platform::CPUDeviceContext());
}
}
}
@ -242,11 +241,10 @@ LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
for (size_t i = 0; i < indice.size(); i++) {
auto index = indice[i];
auto target = result.Slice<value_type>(i, i + 1);
auto slice = source->Slice<value_type>(index, index + 1);
auto target = result.Slice(i, i + 1);
auto slice = source->Slice(index, index + 1);
target.CopyFrom<value_type>(slice, platform::CPUPlace(),
platform::CPUDeviceContext());
target.CopyFrom(slice, platform::CPUPlace(), platform::CPUDeviceContext());
}
return result;
@ -277,10 +275,10 @@ LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
// target is result[index]
auto index = seq_meta.begin + batch_id;
if (index >= seq_meta.end) break;
auto source_ = source[batch_id].Slice<float>(seq_id, seq_id + 1);
auto target = result.Slice<float>(index, index + 1);
target.CopyFrom<float>(source_, platform::CPUPlace(),
platform::CPUDeviceContext());
auto source_ = source[batch_id].Slice(seq_id, seq_id + 1);
auto target = result.Slice(index, index + 1);
target.CopyFrom(source_, platform::CPUPlace(),
platform::CPUDeviceContext());
}
}

@ -91,7 +91,7 @@ class TensorArrayPackTester : public ::testing::Test {
size_t begin = level[i];
size_t end = level[i + 1];
for (size_t j = begin; j < end; j++) {
auto record = source.Slice<int>(j, j + 1);
auto record = source.Slice(j, j + 1);
for (int dim = 0; dim < 128; dim++) {
record.mutable_data<int>(platform::CPUPlace())[dim] = j - begin;
}

@ -19,12 +19,50 @@ limitations under the License. */
namespace paddle {
namespace framework {
template <typename... T>
struct SizeOfTypeFunctor;
template <typename T>
struct SizeOfTypeFunctor<T> {
size_t operator()(std::type_index type) const {
if (typeid(T).hash_code() == type.hash_code()) {
return sizeof(T);
} else {
return 0UL;
}
}
};
template <>
struct SizeOfTypeFunctor<> {
size_t operator()(std::type_index type) const { return 0UL; }
};
template <typename HEAD, typename... TAIL>
struct SizeOfTypeFunctor<HEAD, TAIL...> {
size_t operator()(std::type_index type) const {
SizeOfTypeFunctor<HEAD> head;
size_t head_size = head(type);
if (head_size != 0) {
return head_size;
}
SizeOfTypeFunctor<TAIL...> tail;
return tail(type);
}
};
static inline size_t SizeOfType(std::type_index type) {
SizeOfTypeFunctor<int, float, double, int16_t, int64_t> functor;
size_t size = functor(type);
PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
return size;
}
inline void Tensor::check_memory_size() const {
PADDLE_ENFORCE_NOT_NULL(
holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
PADDLE_ENFORCE_GE(
holder_->size(), numel() * sizeof(T) + offset_,
holder_->size(), numel() * SizeOfType(type()) + offset_,
"Tensor's dims_ is out of bound. Call Tensor::mutable_data "
"first to re-allocate memory.\n"
"or maybe the required data-type mismatches the data already stored.");
@ -32,14 +70,23 @@ inline void Tensor::check_memory_size() const {
template <typename T>
inline const T* Tensor::data() const {
check_memory_size<T>();
check_memory_size();
PADDLE_ENFORCE(std::is_same<T, void>::value ||
holder_->type().hash_code() == typeid(T).hash_code(),
"Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
return reinterpret_cast<const T*>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
template <typename T>
inline T* Tensor::data() {
check_memory_size<T>();
check_memory_size();
PADDLE_ENFORCE(std::is_same<T, void>::value ||
holder_->type().hash_code() == typeid(T).hash_code(),
"Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
@ -54,51 +101,62 @@ inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
template <typename T>
inline T* Tensor::mutable_data(platform::Place place) {
static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
}
inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
PADDLE_ENFORCE_GT(numel(), 0,
"Tensor's numel must be larger than zero to call "
"Tensor::mutable_data. Call Tensor::set_dim first.");
int64_t size = numel() * SizeOfType(type);
/* some versions of boost::variant don't have operator!= */
int64_t size = numel() * sizeof(T);
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size));
holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size, type));
} else if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
}
#else
holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
boost::get<platform::GPUPlace>(place), size));
holder_.reset(new PlaceholderImpl<platform::GPUPlace>(
boost::get<platform::GPUPlace>(place), size, type));
}
#endif
offset_ = 0;
}
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
inline void* Tensor::mutable_data(platform::Place place) {
PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing");
return mutable_data(place, holder_->type());
}
template <typename T>
inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
src.check_memory_size<T>();
src.check_memory_size();
*this = src;
return *this;
}
template <typename T>
inline void Tensor::CopyFrom(const Tensor& src,
const platform::Place& dst_place,
const platform::DeviceContext& ctx) {
src.check_memory_size<T>();
src.check_memory_size();
Resize(src.dims());
auto src_place = src.holder_->place();
auto src_ptr = static_cast<const void*>(src.data<T>());
auto src_ptr = src.data<void>();
auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
auto dst_ptr = mutable_data(dst_place, src.type());
auto size = src.numel() * sizeof(T);
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
@ -165,9 +223,8 @@ inline void Tensor::CopyFromVector(const std::vector<T>& src,
#endif
}
template <typename T>
inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
check_memory_size<T>();
check_memory_size();
PADDLE_ENFORCE_GE(begin_idx, 0, "Slice begin index is less than zero.");
PADDLE_ENFORCE_LE(end_idx, dims_[0], "Slice end index is out of bound.");
PADDLE_ENFORCE_LT(begin_idx, end_idx,
@ -182,7 +239,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
DDim dst_dims = dims_;
dst_dims[0] = end_idx - begin_idx;
dst.Resize(dst_dims);
dst.offset_ = offset_ + begin_idx * base * sizeof(T);
dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
return dst;
}
}
@ -196,10 +253,9 @@ inline const DDim& Tensor::dims() const { return dims_; }
inline int64_t Tensor::numel() const { return product(dims_); }
template <typename T>
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
Tensor res;
res.ShareDataWith<T>(src);
res.ShareDataWith(src);
res.Resize(flatten_to_2d(src.dims(), num_col_dims));
return res;
}

@ -108,7 +108,7 @@ TEST(Tensor, ShareDataWith) {
// Try to share data form uninitialized tensor
bool caught = false;
try {
dst_tensor.ShareDataWith<float>(src_tensor);
dst_tensor.ShareDataWith(src_tensor);
} catch (paddle::platform::EnforceNotMet err) {
caught = true;
std::string msg =
@ -122,7 +122,7 @@ TEST(Tensor, ShareDataWith) {
ASSERT_TRUE(caught);
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
dst_tensor.ShareDataWith<int>(src_tensor);
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
@ -131,7 +131,7 @@ TEST(Tensor, ShareDataWith) {
Tensor src_tensor;
Tensor dst_tensor;
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
dst_tensor.ShareDataWith<int>(src_tensor);
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
#endif
@ -143,7 +143,7 @@ TEST(Tensor, Slice) {
{
Tensor src_tensor;
src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
Tensor slice_tensor = src_tensor.Slice<int>(1, 3);
Tensor slice_tensor = src_tensor.Slice(1, 3);
DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 3);
EXPECT_EQ(slice_dims[0], 2);
@ -167,7 +167,7 @@ TEST(Tensor, Slice) {
{
Tensor src_tensor;
src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
Tensor slice_tensor = src_tensor.Slice<double>(2, 6);
Tensor slice_tensor = src_tensor.Slice(2, 6);
DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2);
EXPECT_EQ(slice_dims[0], 4);
@ -202,7 +202,7 @@ TEST(Tensor, CopyFrom) {
memcpy(src_ptr, arr, 9 * sizeof(int));
auto cpu_place = new paddle::platform::CPUPlace();
dst_tensor.CopyFrom<int>(src_tensor, *cpu_place, cpu_ctx);
dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx);
const int* dst_ptr = dst_tensor.data<int>();
ASSERT_NE(src_ptr, dst_ptr);
@ -210,8 +210,8 @@ TEST(Tensor, CopyFrom) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]);
}
Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
dst_tensor.CopyFrom<int>(slice_tensor, *cpu_place, cpu_ctx);
Tensor slice_tensor = src_tensor.Slice(1, 2);
dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx);
const int* slice_ptr = slice_tensor.data<int>();
dst_ptr = dst_tensor.data<int>();
ASSERT_NE(dst_ptr, slice_ptr);
@ -233,11 +233,11 @@ TEST(Tensor, CopyFrom) {
// CPU Tensor to GPU Tensor
auto gpu_place = new paddle::platform::GPUPlace(0);
CUDADeviceContext gpu_ctx(*gpu_place);
gpu_tensor.CopyFrom<int>(src_tensor, *gpu_place, gpu_ctx);
gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx);
// GPU Tensor to CPU Tensor
auto cpu_place = new paddle::platform::CPUPlace();
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Tensors
gpu_ctx.Wait();
@ -247,13 +247,13 @@ TEST(Tensor, CopyFrom) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]);
}
Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
Tensor slice_tensor = src_tensor.Slice(1, 2);
// CPU Slice Tensor to GPU Tensor
gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_place, gpu_ctx);
gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx);
// GPU Tensor to CPU Tensor
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Slice Tensors
gpu_ctx.Wait();
@ -320,7 +320,7 @@ TEST(Tensor, CopyFromVector) {
CUDADeviceContext gpu_ctx(*gpu_place);
gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
// Copy from GPU to CPU tensor for comparison
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Tensors
gpu_ctx.Wait();
@ -340,7 +340,7 @@ TEST(Tensor, CopyFromVector) {
cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
gpu_tensor.Resize(make_ddim({2, 2}));
gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Tensors
gpu_ctx.Wait();
@ -368,7 +368,7 @@ TEST(Tensor, ReshapeToMatrix) {
for (int i = 0; i < 2 * 3 * 4 * 9; ++i) {
src_ptr[i] = i;
}
Tensor res = ReshapeToMatrix<int>(src, 2);
Tensor res = ReshapeToMatrix(src, 2);
ASSERT_EQ(res.dims()[0], 2 * 3);
ASSERT_EQ(res.dims()[1], 4 * 9);
}

@ -0,0 +1,58 @@
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
settings(batch_size=16)
channels = get_config_arg("channels", int, 2)
def two_fc(input, group_name):
out1 = fc_layer(input=input,
name=group_name+'_fc1',
size=channels,
bias_attr=False,
act=LinearActivation())
out2 = fc_layer(input=input,
name=group_name+'_fc2',
size=channels,
bias_attr=False,
act=LinearActivation())
return out1, out2
data = data_layer(name ="input", size=channels*16*16)
conv = img_conv_layer(input=data,
num_channels=channels,
filter_size=3,
num_filters=channels,
padding=1,
shared_biases=True,
act=LinearActivation())
pool = img_pool_layer(input=conv,
pool_size=3,
stride=2,
padding=1,
pool_type=AvgPooling())
a1, a2 = two_fc(input=pool, group_name='a')
concat = concat_layer(input=[a1, a2])
b1, b2 = two_fc(input=pool, group_name='b')
addto = addto_layer(input=[b1, b2])
outputs([concat, addto])

@ -0,0 +1,60 @@
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
settings(batch_size=16)
channels = get_config_arg("channels", int, 2)
def two_pool(input, group_name):
out1 = img_pool_layer(input=input,
name=group_name+'_pool1',
pool_size=3,
stride=2,
padding=0,
pool_type=MaxPooling())
out2 = img_pool_layer(input=input,
name=group_name+'_pool2',
pool_size=5,
stride=2,
padding=1,
pool_type=MaxPooling())
return out1, out2
data = data_layer(name ="input", size=channels*16*16)
conv = img_conv_layer(input=data,
num_channels=channels,
filter_size=3,
num_filters=channels,
padding=1,
shared_biases=True,
act=LinearActivation())
pool = img_pool_layer(input=conv,
pool_size=3,
stride=1,
padding=1,
pool_type=AvgPooling())
a1, a2 = two_pool(input=pool, group_name='a')
concat = concat_layer(input=[a1, a2])
b1, b2 = two_pool(input=pool, group_name='b')
addto = addto_layer(input=[b1, b2])
outputs([concat, addto])

@ -250,7 +250,7 @@ TEST(MKLDNNActivation, Activations) {
DECLARE_string(config_args);
TEST(MKLDNNLayer, branches) {
std::vector<std::string> cases = {"conv"};
std::vector<std::string> cases = {"conv", "pool", "fc"};
for (auto name : cases) {
std::string config = "./gserver/tests/mkldnn_branches_" + name + ".conf";
for (auto channels : {2, 32}) {

@ -69,5 +69,8 @@ information, or not. But the output only shares the LoD with input `Inference`.
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker);
REGISTER_OP_CPU_KERNEL(accuracy,
ops::AccuracyKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
accuracy, ops::AccuracyKernel<paddle::platform::CPUPlace, float>,
ops::AccuracyKernel<paddle::platform::CPUPlace, int>,
ops::AccuracyKernel<paddle::platform::CPUPlace, double>,
ops::AccuracyKernel<paddle::platform::CPUPlace, int64_t>);

@ -21,9 +21,9 @@ namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
template <int BlockSize>
__global__ void AccuracyCudaKernel(const int N, const int D, const int* Xdata,
const int* labeldata, float* accuracy) {
template <typename T, int BlockSize>
__global__ void AccuracyCudaKernel(const int N, const int D, const T* Xdata,
const T* labeldata, float* accuracy) {
int count = 0;
__shared__ int total[BlockSize];
@ -57,8 +57,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
auto* accuracy = ctx.Output<Tensor>("Accuracy");
// FIXME(typhoonzero): only support indices currently
// if add support for output values, how to detect the data type?
const int* inference_data = inference->data<int>();
const int* label_data = label->data<int>();
const T* inference_data = inference->data<T>();
const T* label_data = label->data<T>();
float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
size_t num_samples = inference->dims()[0];
@ -69,7 +69,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
return;
}
AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
AccuracyCudaKernel<T, PADDLE_CUDA_NUM_THREADS><<<
1, PADDLE_CUDA_NUM_THREADS, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(
ctx.device_context())
@ -81,5 +81,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
REGISTER_OP_GPU_KERNEL(accuracy,
paddle::operators::AccuracyOpCUDAKernel<float>);
REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
paddle::operators::AccuracyOpCUDAKernel<double>,
paddle::operators::AccuracyOpCUDAKernel<int>,
paddle::operators::AccuracyOpCUDAKernel<int64_t>);

@ -108,17 +108,17 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
int in_step = input_channels / groups;
int out_step = output_channels / groups;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice<T>(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice<T>(i, i + 1).Resize(output_matrix_shape);
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
// im2col
Tensor in_slice = in_batch.Slice<T>(g * in_step, (g + 1) * in_step);
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
im2col(context.device_context(), in_slice, col, strides[0], strides[1],
paddings[0], paddings[1]);
// gemm
Tensor out_slice = out_batch.Slice<T>(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice<T>(g * out_step, (g + 1) * out_step);
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<Place, T>(context.device_context(), filter_slice, false,
col_matrix, false, T(1.0), &out_slice, T(0.0));
}
@ -198,22 +198,20 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch =
output_grad->Slice<T>(i, i + 1).Resize(output_matrix_shape);
Tensor in_grad_batch =
input_grad->Slice<T>(i, i + 1).Resize(input_shape);
output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// gemm
Tensor out_grad_slice =
out_grad_batch.Slice<T>(g * out_step, (g + 1) * out_step);
Tensor filter_slice =
filter.Slice<T>(g * out_step, (g + 1) * out_step);
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<Place, T>(context.device_context(), filter_slice, true,
out_grad_slice, false, T(1.0), &col_matrix,
T(0.0));
// col2im
Tensor in_grad_slice =
in_grad_batch.Slice<T>(g * in_step, (g + 1) * in_step);
in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
col2im(context.device_context(), in_grad_slice, col, strides[0],
strides[1], paddings[0], paddings[1]);
}
@ -229,19 +227,19 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch =
output_grad->Slice<T>(i, i + 1).Resize(output_matrix_shape);
Tensor in_batch = input->Slice<T>(i, i + 1).Resize(input_shape);
output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// im2col
Tensor out_grad_slice =
out_grad_batch.Slice<T>(g * out_step, (g + 1) * out_step);
Tensor in_slice = in_batch.Slice<T>(g * in_step, (g + 1) * in_step);
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
im2col(context.device_context(), in_slice, col, strides[0],
strides[1], paddings[0], paddings[1]);
// gemm
Tensor filter_grad_slice =
filter_grad_.Slice<T>(g * out_step, (g + 1) * out_step);
filter_grad_.Slice(g * out_step, (g + 1) * out_step);
math::matmul<Place, T>(context.device_context(), out_grad_slice,
false, col_matrix, true, T(1.0),
&filter_grad_slice, T(1.0));

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save