Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dist-lookup-table-only-support-sgd

revert-12646-feature/jit/xbyak
qiaolongfei 7 years ago
commit fd53fdf84b

@ -27,15 +27,6 @@ script:
# 43min timeout
paddle/scripts/paddle_docker_build.sh ${JOB}
if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
- |
if [[ "$JOB" != "doc" ]]; then exit 0; fi;
# For document only
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
export DOCS_DIR=`pwd`
cd ..
curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
notifications:
email:
on_success: change

@ -265,6 +265,7 @@ function(cc_test TARGET_NAME)
if (${cc_test_SERIAL})
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endif()
endfunction(cc_test)
@ -330,6 +331,7 @@ function(nv_test TARGET_NAME)
if (nv_test_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endif()
endfunction(nv_test)
@ -577,7 +579,8 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()

@ -1,21 +1,27 @@
# 如何使用timeline工具做性能分析
1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。
1. 在训练的主循环外加上`profiler.start_profiler(...)`和`profiler.stop_profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。
**提示:**
请不要在timeline记录信息时运行太多次迭代因为timeline中的记录数量和迭代次数是成正比的。
```python
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[])
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
if pass_id == 0 and batch_id == 5:
profiler.start_profiler("All")
elif pass_id == 0 and batch_id == 10:
profiler.stop_profiler("total", "/tmp/profile")
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[])
...
```
1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`,这个程序默认会生成一个`/tmp/timeline`文件,你也可以用命令行参数来修改这个路径,请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。
```python
python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
```
1. 打开chrome浏览器访问<chrome://tracing/>,用`load`按钮来加载生成的`timeline`文件。

@ -1,15 +1,17 @@
# how to use timeline tool to do profile
1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
1. Add `profiler.start_profiler(...)`和`profiler.stop_profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
```python
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[],
use_program_cache=True)
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
if pass_id == 0 and batch_id == 5:
profiler.start_profiler("All")
elif pass_id == 0 and batch_id == 10:
profiler.stop_profiler("total", "/tmp/profile")
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[])
...
```
@ -17,6 +19,10 @@
file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at
[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
```python
python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
```
1. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.
![chrome tracing](./tracing.jpeg)

@ -17,6 +17,7 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent r("all_reduce", nullptr);
if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1;
} else {

@ -16,12 +16,14 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
namespace details {
void ReduceOpHandle::RunImpl() {
platform::RecordEvent r("reduce", nullptr);
if (places_.size() == 1) return;
// the input and output may have dummy var.
auto in_var_handles = DynamicCast<VarHandle>(inputs_);

@ -17,6 +17,7 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
eptr = std::current_exception();
}
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
drop_scope_counter_ += 1;
if (!fetch_tensors.empty() ||
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {

@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
@ -34,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) {
std::unique_ptr<platform::RecordEvent> event(
new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
BlockingQueue<VarHandleBase *> ready_vars;
@ -84,6 +87,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Clean run context
run_op_futures_.clear();
exception_holder_.Clear();
event.reset(nullptr);
// Step 3. Execution
while (!pending_vars.empty()) {

@ -136,6 +136,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform::SetDeviceId(dev_id);
#endif
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
VLOG(10) << "+ " << DebugStringEx(&scope);
}
@ -639,9 +641,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
// For profiling, don't move out of this function because that will result
// in the failure of multi-GPU profiling.
platform::RecordEvent record_event(Type(), dev_ctx);
// check if op[type] has kernel registered.
auto& all_op_kernels = AllOpKernels();
auto kernels_iter = all_op_kernels.find(type_);

@ -74,9 +74,10 @@ if (WITH_ANAKIN) # only needed in CI
target_link_libraries(inference_anakin_api anakin anakin_saber_common)
target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
if (WITH_TESTING)
cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api_shared)
target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
# this test is unstable, disable it first.
#cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
#ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
#DEPS inference_anakin_api_shared)
#target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif(WITH_TESTING)
endif()

@ -31,7 +31,6 @@ class FeedOp : public framework::OperatorBase {
const platform::Place &place) const override {
// get device context from pool
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
platform::RecordEvent record_event(Type(), dev_ctx);
auto feed_var_name = Input("X");
auto *feed_var = scope.FindVar(feed_var_name);

@ -36,12 +36,6 @@ class FetchBarrierOp : public framework::OperatorBase {
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();

@ -30,9 +30,6 @@ class FetchOp : public framework::OperatorBase {
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
auto fetch_var_name = Input("X");
auto *fetch_var = scope.FindVar(fetch_var_name);
PADDLE_ENFORCE(fetch_var != nullptr,

@ -31,9 +31,6 @@ class LoadOp : public framework::OperatorBase {
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
platform::RecordEvent record_event(Type(), dev_ctx);
// FIXME(yuyang18): We save variable to local file now, but we should change
// it to save an output stream.
auto filename = Attr<std::string>("file_path");

@ -32,11 +32,16 @@ class LookupTableOp : public framework::OperatorWithKernel {
auto table_dims = ctx->GetInputDim("W");
auto ids_dims = ctx->GetInputDim("Ids");
int ids_rank = ids_dims.size();
PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[1], 1);
PADDLE_ENFORCE_EQ(table_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
"The last dimension of the 'Ids' tensor must be 1.");
ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
auto output_dims =
framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1));
output_dims.push_back(table_dims[1]);
ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
if (ctx->GetOutputsVarType("Out")[0] ==
framework::proto::VarType::LOD_TENSOR) {
@ -61,8 +66,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Ids",
"An input with type int32 or int64 "
"contains the ids to be looked up in W. "
"Ids must be a column vector with rank = 2. "
"The 2nd dimension size must be 1.");
"The last dimension size must be 1.");
AddOutput("Out", "The lookup results, which have the same type as W.");
AddAttr<bool>("is_sparse",
"(boolean, default false) "

@ -118,28 +118,31 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
auto *ids_data = ids->data<int64_t>();
auto ids_dim = ids->dims();
int64_t ids_num = ids->numel();
auto stream = dev_ctx.stream();
// copy GPU memory to CPU pinned memory
framework::Vector<int64_t> new_rows;
new_rows.resize(ids_dim[0]);
new_rows.resize(ids_num);
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
// TODO(yuyang18): Strange code here.
memory::Copy(platform::CPUPlace(),
new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
ids_data, ids_dim[0] * sizeof(int64_t), stream);
ids_data, ids_num * sizeof(int64_t), stream);
d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_dim[0], table->dims()[1]});
d_table_value->Resize({ids_num, table->dims()[1]});
d_table_value->mutable_data<T>(context.GetPlace());
auto *d_table_data = d_table_value->data<T>();
auto *d_output_data = d_output->data<T>();
PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
auto d_output_dims = d_output->dims();
PADDLE_ENFORCE_EQ(
d_table_value->dims(),
framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data,
d_output->numel() * sizeof(T), stream);

@ -109,17 +109,17 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
auto *ids_data = ids->data<int64_t>();
auto ids_dim = ids->dims();
int64_t ids_num = ids->numel();
framework::Vector<int64_t> new_rows;
new_rows.reserve(ids_dim[0]);
for (int64_t i = 0; i < ids_dim[0]; i++) {
new_rows.reserve(ids_num);
for (int64_t i = 0; i < ids_num; i++) {
new_rows.push_back(ids_data[i]);
}
d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_dim[0], table_dim[1]});
d_table_value->Resize({ids_num, table_dim[1]});
d_table_value->mutable_data<T>(context.GetPlace());
d_table->set_height(table_dim[0]);
@ -127,7 +127,10 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *d_output_data = d_output->data<T>();
auto *d_table_data = d_table_value->data<T>();
PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
auto d_output_dims = d_output->dims();
PADDLE_ENFORCE_EQ(
d_table_value->dims(),
framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
} else {
auto *ids = context.Input<LoDTensor>("Ids");
@ -135,10 +138,9 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
auto *ids_data = ids->data<int64_t>();
auto ids_dim = ids->dims();
int N = table_dim[0];
int D = d_output->dims()[1];
int D = table_dim[1];
auto *d_output_data = d_output->data<T>();
auto *d_table_data = d_table->mutable_data<T>(context.GetPlace());

@ -18,7 +18,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
@ -166,8 +165,6 @@ class ParallelDoOp : public framework::OperatorBase {
workers.emplace_back(
framework::Async([program, cur_scope, place, block, place_idx] {
// Give the thread an id to distinguish parallel block with same id.
platform::RecordThread rt(static_cast<int>(place_idx) + 1);
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
@ -244,8 +241,6 @@ class ParallelDoGradOp : public framework::OperatorBase {
// execute
workers.emplace_back(
framework::Async([program, cur_scope, place, block, i] {
// Give the thread an id to distinguish parallel block with same id.
platform::RecordThread rt(static_cast<int>(i) + 1);
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);

@ -40,8 +40,6 @@ class RecvOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();

@ -39,11 +39,6 @@ class SendBarrierOp : public framework::OperatorBase {
std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
bool sync_mode = Attr<bool>("sync_mode");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();

@ -42,9 +42,6 @@ class SendOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();

@ -30,8 +30,16 @@ class SoftmaxCUDNNKernel : public framework::OpKernel<T> {
// allocate memory on device.
Out->mutable_data<T>(context.GetPlace());
auto dims = X->dims();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::LoDTensor flattened_x;
framework::LoDTensor flattened_out;
flattened_x.ShareDataWith(*X).Resize(flattened_dims);
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
math::SoftmaxCUDNNFunctor<T>()(
context.template device_context<platform::CUDADeviceContext>(), X, Out);
context.template device_context<platform::CUDADeviceContext>(),
&flattened_x, &flattened_out);
}
};
@ -46,9 +54,18 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
// allocate memory on device.
dX->mutable_data<T>(context.GetPlace());
auto dims = Out->dims();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::LoDTensor flattened_out;
framework::LoDTensor flattened_d_out;
framework::LoDTensor flattened_d_x;
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims);
flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims);
math::SoftmaxGradCUDNNFunctor<T>()(
context.template device_context<platform::CUDADeviceContext>(), Out,
dOut, dX);
context.template device_context<platform::CUDADeviceContext>(),
&flattened_out, &flattened_d_out, &flattened_d_x);
}
};

@ -26,9 +26,9 @@ using paddle::platform::MKLDNNMemDesc;
using mkldnn::memory; // Note: paddle has also "memory" namespace
using mkldnn::primitive;
using mkldnn::softmax_forward;
using mkldnn::softmax_backward;
using mkldnn::prop_kind;
using mkldnn::softmax_backward;
using mkldnn::softmax_forward;
using mkldnn::stream;
using platform::to_void_cast;
@ -113,17 +113,27 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
auto mkldnn_engine = dev_ctx.GetEngine();
const Tensor* input = ctx.Input<Tensor>("X");
Tensor* output = ctx.Output<Tensor>("Out");
PADDLE_ENFORCE(input->dims().size() == 2UL,
"The input of softmax op must be a 2D matrix.");
const T* input_data = input->data<T>();
// allocate memory for output
T* output_data = output->mutable_data<T>(ctx.GetPlace());
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// MKL-DNN does support softmax over selected axis. Having 2D Tensor,
// we will make normalization after final eg. axis: 1
PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
"Softmax input and output dimensions should match");
PADDLE_ENFORCE_EQ(
input->dims(), output->dims(),
"The shape of softmax's input and output must be identical.");
// make sure 'output' holds memory, which will be shared by
// 'flattened_output' later.
output->mutable_data<T>(ctx.GetPlace());
// flatten input and output to 2-D matrixs
auto dims = input->dims(); // input and output share the same shape
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::Tensor flattened_input;
framework::Tensor flattened_output;
flattened_input.ShareDataWith(*input).Resize(flattened_dims);
flattened_output.ShareDataWith(*output).Resize(flattened_dims);
const T* input_data = flattened_input.data<T>();
T* output_data = flattened_output.mutable_data<T>(ctx.GetPlace());
std::vector<int> src_tz = paddle::framework::vectorize2int(flattened_dims);
std::vector<int> dst_tz = src_tz;
// Same memory descriptor to be used for input and output
memory::dims softmax_tz = {src_tz[0], src_tz[1]};
// Generate keys for storing/retriving primitives for this operator
@ -174,23 +184,34 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
auto mkldnn_engine = dev_ctx.GetEngine();
const Tensor* output = ctx.Input<Tensor>("Out");
const T* dst_data = output->data<T>();
auto* dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
const auto* diff_dst_ptr = dout->template data<T>();
auto* dx =
ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
T* diff_src_ptr = dx->template mutable_data<T>(ctx.GetPlace());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
PADDLE_ENFORCE_EQ(
dout->dims(), dx->dims(),
"The shape of softmax_grad's input and output must be identical.");
// make sure 'dx' holds memory, which will be shared by 'flattened_dx'
// later.
dx->template mutable_data<T>(ctx.GetPlace());
auto dims = dout->dims(); // input and output share the same shape
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::Tensor flattened_output;
framework::Tensor flattened_dout;
framework::Tensor flattened_dx;
flattened_output.ShareDataWith(*output).Resize(flattened_dims);
flattened_dout.ShareDataWith(*dout).Resize(flattened_dims);
flattened_dx.ShareDataWith(*dx).Resize(flattened_dims);
const T* dst_data = flattened_output.data<T>();
const T* diff_dst_ptr = flattened_dout.template data<T>();
T* diff_src_ptr = flattened_dx.template mutable_data<T>(ctx.GetPlace());
std::vector<int> dst_tz = paddle::framework::vectorize2int(flattened_dims);
std::vector<int> src_tz(dst_tz);
PADDLE_ENFORCE(output->dims().size() == 2UL,
"The input of softmax op must be a 2D matrix.");
// MKL-DNN does support softmax over selected axis. Having 2D Tensor,
// we will make normalization after final eg. axis: 1
PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
"Softmax input and output dimensions should match");
// Same memory descriptor to be used for input and output
memory::dims softmax_tz = {src_tz[0], src_tz[1]};
// Currently only supports NC data format

@ -37,10 +37,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SoftmaxOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE(x_dims.size() == 2UL,
"The input of softmax op must be a matrix.");
ctx->SetOutputDim("Out", x_dims);
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
@ -81,8 +78,8 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor of softmax. "
"2-D with shape [batch_size, input_feature_dimensions].");
"The input tensor of softmax, "
"whose last dimension is the input_feature_dimensions.");
AddOutput("Out", "The normalized values with the same shape as X.")
.Reuse("X");
AddAttr<bool>(
@ -105,20 +102,23 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
Softmax Operator.
The input of the softmax operator is a 2-D tensor with shape N x K (N is the
batch_size, K is the dimension of input feature). The output tensor has the
same shape as the input tensor.
The input of the softmax operator is a tensor of any rank. The output tensor
has the same shape as the input.
For each row of the input tensor, the softmax operator squashes the
K-dimensional vector of arbitrary real values to a K-dimensional vector of real
values in the range [0, 1] that add up to 1.
The input tensor will first be logically flattened to a 2-D matrix. The matrix's
second dimension(row length) is as same as the last dimension of the input
tensor, and the first dimension(column length) is the product of all other
dimensions of the input tensor. For each row of the matrix, the softmax operator
squashes the K-dimensional(K is the width of the matrix, which is also the size
of the input tensor's last dimension) vector of arbitrary real values to a
K-dimensional vector of real values in the range [0, 1] that add up to 1.
It computes the exponential of the given dimension and the sum of exponential
values of all the other dimensions in the K-dimensional vector input.
Then the ratio of the exponential of the given dimension and the sum of
exponential values of all the other dimensions is the output of the softmax
operator.
For each row $i$ and each column $j$ in Input(X), we have:
For each row $i$ and each column $j$ in the matrix, we have:
$$Out[i, j] = \frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}$$
)DOC");

@ -31,8 +31,16 @@ class SoftmaxKernel : public framework::OpKernel<T> {
// allocate memory on device.
Out->mutable_data<T>(context.GetPlace());
auto dims = X->dims();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::LoDTensor flattened_x;
framework::LoDTensor flattened_out;
flattened_x.ShareDataWith(*X).Resize(flattened_dims);
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
math::SoftmaxFunctor<DeviceContext, T>()(
context.template device_context<DeviceContext>(), X, Out);
context.template device_context<DeviceContext>(), &flattened_x,
&flattened_out);
}
};
@ -47,8 +55,18 @@ class SoftmaxGradKernel : public framework::OpKernel<T> {
// allocate memory on device.
dX->mutable_data<T>(context.GetPlace());
auto dims = Out->dims();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::LoDTensor flattened_out;
framework::LoDTensor flattened_d_out;
framework::LoDTensor flattened_d_x;
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims);
flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims);
math::SoftmaxGradFunctor<DeviceContext, T>()(
context.template device_context<DeviceContext>(), Out, dOut, dX);
context.template device_context<DeviceContext>(), &flattened_out,
&flattened_d_out, &flattened_d_x);
}
};

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save