Merge develop

createGenDocLib
sneaxiy 7 years ago
commit 1083e99520

@ -138,12 +138,6 @@ else()
set(THIRD_PARTY_BUILD_TYPE Release)
endif()
if(WITH_MKL)
option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF)
if (MKL_SPLIT_GEMM)
add_definitions(-DPADDLE_MKL_SPLIT_GEMM)
endif()
endif()
set(WITH_MKLML ${WITH_MKL})
if (NOT DEFINED WITH_MKLDNN)
if (WITH_MKL AND AVX2_FOUND)

@ -36,19 +36,19 @@
<tbody>
<tr>
<td>OpProtoMake定义 </td>
<td>`.cc`文件Backward Op不需要定义OpProtoMake </td>
<td>.cc 文件Backward Op不需要定义OpProtoMake </td>
</tr>
<tr>
<td>Op定义 </td>
<td> `.cc`文件</td>
<td> .cc 文件</td>
</tr>
<tr>
<td>Kernel实现 </td>
<td> CPU、CUDA共享Kernel实现在`.h`文件中否则CPU 实现在`.cc`文件中CUDA 实现在`.cu`文件中。</td>
<td> CPU、CUDA共享Kernel实现在.h 文件中否则CPU 实现在.cc 文件中CUDA 实现在.cu 文件中。</td>
</tr>
<tr>
<td>注册Op </td>
<td> Op注册实现在`.cc`文件Kernel注册CPU实现在`.cc`文件中CUDA实现在`.cu`文件中</td>
<td> Op注册实现在.cc 文件Kernel注册CPU实现在.cc 文件中CUDA实现在.cu 文件中</td>
</tr>
</tbody>
</table>
@ -391,7 +391,7 @@ PADDLE_ENFORCE(ctx->HasInput("X"), "");
```
问题示例2 :提示信息过于简单
```
PADDLE_ENFORCE(i != nullptr, "I must be set"); // I是什么?
PADDLE_ENFORCE(i != nullptr, "i must be set"); // i是什么?
```
2. 在报错信息中使用开发人员定义的变量缩写,不易理解!

@ -163,6 +163,7 @@ paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], v
paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None))
paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
@ -192,7 +193,7 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None
paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None)

@ -0,0 +1,48 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace framework {
template <typename T, size_t N>
class Array {
static_assert(N > 0, "The size of array must be larger than 0");
public:
HOSTDEVICE Array() {}
HOSTDEVICE explicit Array(const T &val) {
for (size_t i = 0; i < N; ++i) data_[i] = val;
}
HOSTDEVICE const T *Get() const { return data_; }
HOSTDEVICE T *GetMutable() { return data_; }
HOSTDEVICE T &operator[](size_t index) { return data_[index]; }
HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; }
HOSTDEVICE constexpr size_t size() const { return N; }
private:
T data_[N];
};
} // namespace framework
} // namespace paddle

@ -763,6 +763,8 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
// Create RPC related op handles that connects its in ops and out ops.
void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
ir::Node *node) const {
// FIXME(typhoonzero): Cleanup this deps for both sync mode and async mode
// put them into transpiler.
int op_dev_id = -1;
if (node->Op()->Type() == "send") {
// TODO(paddle-dev): getting the first var is not safe.
@ -771,26 +773,42 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
"This hack no longer holds, please fix.");
// the variable name which contains .block means it was splited by
// split_byref op
// so that we can balance the variable blocks to all the pserver
// instances.
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce &&
node->inputs[0]->Name().find(".block") == std::string::npos) {
std::vector<std::string> input_var_names;
for (ir::Node *n : node->inputs) {
input_var_names.push_back(n->Name());
}
op_dev_id = GetAppropriateDeviceID(input_var_names);
auto send_param_grad = boost::get<std::vector<std::string>>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U);
op_dev_id = GetAppropriateDeviceID({send_param_grad[1]});
VLOG(10) << "send grad " << input_var_names[0] << " origin "
<< send_param_grad[1] << " place: " << op_dev_id;
for (auto &varname : input_var_names) {
result->Get<ShardedVarDevice>(kShardedVarDevice)
.emplace(varname, op_dev_id);
}
result->Get<ShardedVarDevice>(kShardedVarDevice)
.emplace(send_param_grad[1], op_dev_id);
}
} else if (node->Op()->Type() == "recv") {
std::vector<std::string> output_var_names;
for (ir::Node *n : node->outputs) {
output_var_names.push_back(n->Name());
}
op_dev_id = GetAppropriateDeviceID(output_var_names);
auto recv_param_grad = boost::get<std::vector<std::string>>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
// FIXME(typhoonzero): assume each recv op output one param
// Use the same place as send.
if (recv_param_grad.size() == 2U) {
op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]);
VLOG(10) << "recv param " << recv_param_grad[0]
<< " get grad place: " << recv_param_grad[1]
<< " place: " << op_dev_id;
} else {
op_dev_id = GetAppropriateDeviceID(output_var_names);
}
for (auto &varname : output_var_names) {
result->Get<ShardedVarDevice>(kShardedVarDevice)
.emplace(varname, op_dev_id);

@ -54,7 +54,8 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
<< "\\n"
<< var_handle_ptr->place_ << "\\n"
<< var_handle_ptr->version_ << "\"]" << std::endl;
<< "scope: " << var_handle_ptr->scope_idx_ << "\\n"
<< "v" << var_handle_ptr->version_ << "\"]" << std::endl;
} else if (dummy_ptr) {
sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
}

@ -163,8 +163,8 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
// 3. Detect op2 -> var2 -> op4
// 4. Detect op2 -> var3 -> op5
// But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2
ASSERT_GE(count, 1UL);
ASSERT_LE(count, 2UL);
ASSERT_GE(count, 1);
ASSERT_LE(count, 2);
}
} // namespace ir

@ -17,7 +17,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
namespace ir {
const char Node::kControlDepVarName[] = "__control_var";
constexpr char Node::kControlDepVarName[];
} // namespace ir
} // namespace framework
} // namespace paddle

@ -27,7 +27,7 @@ namespace ir {
class Node {
public:
enum class Type { kOperation, kVariable };
static const char kControlDepVarName[];
static constexpr char kControlDepVarName[] = "__control_var";
explicit Node(const std::string& name, Type type)
: name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}

@ -139,7 +139,7 @@ int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) {
}
auto write_iter = id_to_index_.find(key);
if (write_iter == id_to_index_.end()) {
size_t row_num = rows_.size();
int row_num = rows_.size();
if (row_num == value_->dims()[0]) {
rwlock_->UNLock();
PADDLE_THROW("selected rows is full, then length exceed %d", row_num);
@ -182,7 +182,7 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
"output tensor should have the same shape with table "
"except the dims[0].");
for (size_t i = 0; i < ids.numel(); ++i) {
for (int i = 0; i < ids.numel(); ++i) {
int64_t index = AutoGrownIndex(ids.data<int64_t>()[i], auto_grown);
framework::VisitDataType(
framework::ToDataType(value_->type()),

@ -23,6 +23,8 @@
DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
namespace paddle {
namespace inference {
@ -92,7 +94,7 @@ struct DataRecord {
size_t batch_iter{0};
size_t batch_size{1};
DataRecord() = default;
DataRecord(const std::string &path, int batch_size = 1)
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
@ -165,7 +167,6 @@ struct DataRecord {
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
// DataRecord data(FLAGS_datapath, batch_size);
PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
week_tensor, minute_tensor;
lod_attention_tensor.name = "data_lod_attention";
@ -174,28 +175,33 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
week_tensor.name = "week";
minute_tensor.name = "minute";
auto one_batch = data->NextBatch();
// clang-format off
std::vector<int> rnn_link_data_shape
({static_cast<int>(one_batch.rnn_link_data.size()), static_cast<int>(one_batch.rnn_link_data.front().size())});
std::vector<int> rnn_link_data_shape(
{static_cast<int>(one_batch.rnn_link_data.size()),
static_cast<int>(one_batch.rnn_link_data.front().size())});
lod_attention_tensor.shape.assign({1, 2});
lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2});
init_zero_tensor.shape.assign({batch_size, 15});
init_zero_tensor.lod.assign({one_batch.lod3});
lod_tensor_tensor.shape = rnn_link_data_shape;
lod_tensor_tensor.lod.assign({one_batch.lod1});
week_tensor.shape.assign({(int) one_batch.rnn_week_datas.size(), (int) one_batch.rnn_week_datas.front().size()});
// clang-format off
week_tensor.shape.assign(
{static_cast<int>(one_batch.rnn_week_datas.size()),
static_cast<int>(one_batch.rnn_week_datas.front().size())});
week_tensor.lod.assign({one_batch.lod3});
minute_tensor.shape.assign({(int) one_batch.rnn_minute_datas.size(),
(int) one_batch.rnn_minute_datas.front().size()});
minute_tensor.shape.assign(
{static_cast<int>(one_batch.rnn_minute_datas.size()),
static_cast<int>(one_batch.rnn_minute_datas.front().size())});
minute_tensor.lod.assign({one_batch.lod3});
// clang-format on
// assign data
TensorAssignData(&lod_attention_tensor, std::vector<std::vector<float>>({{0, 0}}));
TensorAssignData(&lod_attention_tensor,
std::vector<std::vector<float>>({{0, 0}}));
std::vector<float> tmp_zeros(batch_size * 15, 0.);
TensorAssignData(&init_zero_tensor, {tmp_zeros});
TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data);
TensorAssignData(&week_tensor, one_batch.rnn_week_datas);
TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas);
// clang-format on
// Set inputs.
auto init_zero_tensor1 = init_zero_tensor;
init_zero_tensor1.name = "hidden_init";
@ -231,12 +237,9 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
os << "\n";
os << " - data: ";
// clang-format off
int dim = std::accumulate(tensor.shape.begin(),
tensor.shape.end(),
1,
[](int a, int b) { return a * b; }); // clang-format on
for (size_t i = 0; i < dim; i++) {
int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
[](int a, int b) { return a * b; });
for (int i = 0; i < dim; i++) {
os << static_cast<float *>(tensor.data.data())[i] << " ";
}
os << '\n';
@ -300,13 +303,16 @@ void TestDituRNNPrediction(const std::string &model_path,
for (int i = 0; i < num_times; i++) {
predictor->Run(input_slots, &outputs);
}
LOG(INFO) << "time/batch: " << timer.toc() / num_times;
LOG(INFO) << "===========profile result===========";
LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times
<< ", latency: " << timer.toc() / num_times << "ms";
LOG(INFO) << "=====================================";
for (auto &out : outputs) {
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
float *data = static_cast<float *>(out.data.data());
for (int i = 0;
for (size_t i = 0;
i < std::min(sizeof(ditu_rnn_target_data) / sizeof(float), size);
i++) {
EXPECT_NEAR(data[i], ditu_rnn_target_data[i], 1e-3);
@ -336,7 +342,7 @@ TEST(Analyzer, SupportIRPass) {
// Directly infer with the original model.
TEST(Analyzer, DituRNN_without_analysis) {
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
10, false, false);
FLAGS_batch_size, false, false, FLAGS_repeat);
}
// Inference with the original model with the analysis turned on, the analysis
@ -344,14 +350,14 @@ TEST(Analyzer, DituRNN_without_analysis) {
TEST(Analyzer, DituRNN_with_analysis) {
LOG(INFO) << "ditu rnn with analysis";
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
10, true, false, 1);
FLAGS_batch_size, true, false, FLAGS_repeat);
}
// Inference with analysis and IR. The IR module will fuse some large kernels.
TEST(Analyzer, DituRNN_with_analysis_with_IR) {
LOG(INFO) << "ditu rnn with analysis and IR fuse";
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
10, true, true, 1);
FLAGS_batch_size, true, true, FLAGS_repeat);
}
} // namespace analysis

File diff suppressed because it is too large Load Diff

@ -0,0 +1,41 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
class AttentionLSTMOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class AttentionLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
} // namespace operators
} // namespace paddle

@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
// #include <string>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {

@ -90,6 +90,11 @@ class Blas {
void GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T* A,
int lda, const T* B, int ldb, T beta, T* C, int ldc) const;
template <typename T>
void GEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
T alpha, const T* A, int lda, const T* B, int ldb, T beta, T* C,
int ldc) const;
#ifdef PADDLE_WITH_MKLML
template <typename T>
T* GEMM_ALLOC(const CBLAS_IDENTIFIER id, const int M, const int N,
@ -109,6 +114,10 @@ class Blas {
void GEMM_FREE(T* data) const;
#endif
template <typename T>
void MatMul(const int M, const int N, const int K, const T* A, const T* B,
T* C) const;
template <typename T>
void MatMul(const framework::Tensor& mat_a, bool trans_a,
const framework::Tensor& mat_b, bool trans_b, T alpha,
@ -140,10 +149,19 @@ class Blas {
template <typename T>
void VCOPY(int n, const T* x, T* y) const;
template <typename T>
void VEXP(int n, const T* x, T* y) const;
template <typename T>
void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta,
T* C) const;
template <typename T>
T DOT(int n, const T* x, const T* y) const;
template <typename T>
void SCAL(int n, const T a, T* x) const;
template <typename T>
void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N,
int K, T alpha, const T* A, const T* B, T beta, T* C,
@ -215,11 +233,26 @@ class BlasT : private Blas<DeviceContext> {
Base()->template VCOPY<T>(args...);
}
template <typename... ARGS>
void VEXP(ARGS... args) const {
Base()->template VEXP<T>(args...);
}
template <typename... ARGS>
void GEMV(ARGS... args) const {
Base()->template GEMV<T>(args...);
}
template <typename... ARGS>
T DOT(ARGS... args) const {
return Base()->template DOT<T>(args...);
}
template <typename... ARGS>
void SCAL(ARGS... args) const {
Base()->template SCAL<T>(args...);
}
template <typename... ARGS>
void BatchedGEMM(ARGS... args) const {
Base()->template BatchedGEMM<T>(args...);

@ -73,6 +73,16 @@ struct CBlas<float> {
platform::dynload::cblas_sgemv(args...);
}
template <typename... ARGS>
static float DOT(ARGS... args) {
return platform::dynload::cblas_sdot(args...);
}
template <typename... ARGS>
static void SCAL(ARGS... args) {
platform::dynload::cblas_sscal(args...);
}
template <typename... ARGS>
static void GEMM_BATCH(ARGS... args) {
platform::dynload::cblas_sgemm_batch(args...);
@ -87,6 +97,11 @@ struct CBlas<float> {
static void VMUL(ARGS... args) {
platform::dynload::vsMul(args...);
}
template <typename... ARGS>
static void VEXP(ARGS... args) {
platform::dynload::vsExp(args...);
}
};
template <>
@ -138,6 +153,16 @@ struct CBlas<double> {
platform::dynload::cblas_dgemv(args...);
}
template <typename... ARGS>
static double DOT(ARGS... args) {
return platform::dynload::cblas_ddot(args...);
}
template <typename... ARGS>
static void SCAL(ARGS... args) {
platform::dynload::cblas_dscal(args...);
}
template <typename... ARGS>
static void GEMM_BATCH(ARGS... args) {
platform::dynload::cblas_dgemm_batch(args...);
@ -152,6 +177,11 @@ struct CBlas<double> {
static void VMUL(ARGS... args) {
platform::dynload::vdMul(args...);
}
template <typename... ARGS>
static void VEXP(ARGS... args) {
platform::dynload::vdExp(args...);
}
};
#else
@ -210,6 +240,9 @@ struct CBlas<platform::float16> {
PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
}
static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); }
static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
#ifdef PADDLE_WITH_MKLML
static void GEMM_BATCH(...) {
PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
@ -217,64 +250,6 @@ struct CBlas<platform::float16> {
#endif
};
template <typename T>
inline bool UseXSMM(const int &m, const int &n, const int &k, bool transa,
bool transb, const T &alpha, const T &beta) {
#ifdef PADDLE_WITH_LIBXSMM
// Refer to https://github.com/hfp/libxsmm/blob/master/README.md
// But the threshold is custom
constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20;
if (m * n * k > LIBXSMM_THRESHOLD || transa || transb ||
std::abs<T>(alpha - static_cast<T>(1) >
std::numeric_limits<T>::epsilon()) ||
std::abs<T>(beta) > std::numeric_limits<T>::epsilon()) {
return false;
} else {
return true;
}
#endif
return false;
}
template <>
inline bool UseXSMM<platform::float16>(const int &m, const int &n, const int &k,
bool transa, bool transb,
const platform::float16 &alpha,
const platform::float16 &beta) {
return false;
}
template <typename T>
inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha,
const T *A, int lda, const T *B, int ldb, T beta, T *C,
int ldc) {
#ifdef PADDLE_WITH_LIBXSMM
if (UseXSMM<T>(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha,
beta)) {
// Note: SMM use ColMajor
const char transa = 'N';
const char transb = 'N';
CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda,
&beta, C, &ldc);
return;
}
#endif
#ifdef PADDLE_MKL_SPLIT_GEMM
constexpr int bs = 2;
if (M % bs == 0 && transA == CblasNoTrans && transB == CblasNoTrans) {
for (int off = 0; off < M; off += bs) {
CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, bs, N, K, alpha,
A + off * lda, lda, B, ldb, beta, C + off * ldb, ldc);
}
return;
}
#endif
CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
beta, C, ldc);
}
#ifdef PADDLE_WITH_MKLML
template <>
template <typename T>
@ -319,8 +294,8 @@ void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
GEMM_WARP<T>(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
beta, C, ldc);
CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <>
@ -329,9 +304,20 @@ void Blas<platform::CPUDeviceContext>::GEMM(bool transA, bool transB, int M,
int N, int K, T alpha, const T *A,
int lda, const T *B, int ldb,
T beta, T *C, int ldc) const {
GEMM_WARP<T>(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
lda, B, ldb, beta, C, ldc);
CBlas<T>::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
lda, B, ldb, beta, C, ldc);
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M,
int N, int K, T alpha, const T *A,
int lda, const T *B, int ldb,
T beta, T *C, int ldc) const {
CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <typename DeviceContext>
@ -399,6 +385,47 @@ void Blas<platform::CPUDeviceContext>::VMUL(int n, const T *x, const T *y,
#endif
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VEXP(int n, const T *x, T *y) const {
#ifdef PADDLE_WITH_MKLML
CBlas<T>::VEXP(n, x, y);
#else
// try to find if openblas support vexp
for (int i = 0; i < n; ++i) {
y[i] = std::exp(x[i]);
}
#endif
}
template <>
template <typename T>
T Blas<platform::CPUDeviceContext>::DOT(int n, const T *x, const T *y) const {
#ifdef PADDLE_WITH_MKLML
return CBlas<T>::DOT(n, x, 1, y, 1);
#else
// try to find if openblas support cblas_dot
T sum = 0;
for (int i = 0; i < n; ++i) {
sum += x[i] * y[i];
}
return sum;
#endif
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::SCAL(int n, const T a, T *x) const {
#ifdef PADDLE_WITH_MKLML
CBlas<T>::SCAL(n, a, x, 1);
#else
// try to find if openblas support cblas_scal
for (int i = 0; i < n; ++i) {
x[i] = a * x[i];
}
#endif
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,
@ -440,6 +467,42 @@ void Blas<platform::CPUDeviceContext>::BatchedGEMM(
#endif
}
template <typename DeviceContext>
template <typename T>
void Blas<DeviceContext>::MatMul(const int M, const int N, const int K,
const T *A, const T *B, T *C) const {
this->template GEMM<T>(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K,
static_cast<T>(1), A, K, B, N, static_cast<T>(0), C,
N);
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::MatMul(const int M, const int N,
const int K, const T *A,
const T *B, T *C) const {
#ifdef PADDLE_WITH_LIBXSMM
// Refer to https://github.com/hfp/libxsmm/blob/master/README.md
// But the threshold is custom constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20;
// Since the matrix is very small,
// so the unit of calculation is already very fast,
// and the if( M*N*K < LIBXSMM_THRESHOLD) would be overhead,
// use xsmm directly.
// Note: SMM use ColMajor
const char transa = 'N';
const char transb = 'N';
const T alpha = static_cast<T>(1);
const T beta = static_cast<T>(0);
CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta,
C, &N);
return;
#endif
CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K,
static_cast<T>(1), A, K, B, N, static_cast<T>(0), C, N);
}
template <typename DeviceContext>
template <typename T>
void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,

@ -0,0 +1,105 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace math {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
template <typename T>
inline T sigmoid(T x) {
return 1. / (1. + exp(-x));
}
template <typename T>
inline T tanh(T x) {
return 2. * sigmoid(2. * x) - 1.;
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_identity(const int n, const T* x, T* y) {
// do nothing
return;
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_sigmoid(const int n, const T* x, T* y) {
const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX;
for (int i = 0; i < n; ++i) {
T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
y[i] = 1.0 / (1.0 + std::exp(-tmp));
}
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_tanh(const int n, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = tanh<T>(x[i]);
}
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_relu(const int n, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
}
}
template <>
inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
float* y) {
// TODO(TJ): complete me
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
}
}
template <>
inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
float* y) {
// TODO(TJ): complete me
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
}
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
class VecActivations {
public:
std::function<void(const int, const T*, T*)> operator()(
const std::string& type) {
if (type == "sigmoid") {
return vec_sigmoid<T, isa>;
} else if (type == "relu") {
return vec_relu<T, isa>;
} else if (type == "tanh") {
return vec_tanh<T, isa>;
} else if (type == "identity" || type == "") {
return vec_identity<T, isa>;
}
PADDLE_THROW("Not support type %s.", type);
}
};
} // namespace math
} // namespace operators
} // namespace paddle

@ -25,17 +25,25 @@ namespace math {
template <typename DeviceContext, typename T>
inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
const int N, const int K, const T* X, const T* W, T* Y,
const T* B = NULL) {
blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast<T>(1), X, W,
static_cast<T>(0), Y);
if (B) {
const T* B = NULL, bool relu = false) {
blas.MatMul(M, N, K, X, W, Y);
if (B == NULL) {
return;
}
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
#endif
for (int i = 0; i < M; i++) {
blas.AXPY(N, static_cast<T>(1), B, Y + i * N);
}
for (int i = 0; i < M; i++) {
blas.AXPY(N, static_cast<T>(1), B, Y + i * N);
}
if (!relu) {
return;
}
// TODO(TJ): fuse relu
LOG(FATAL) << "Not implemented!";
}
} // namespace math

@ -54,7 +54,7 @@ class SamplingIdKernel : public framework::OpKernel<T> {
static_cast<T>(context.Attr<float>("max")));
std::vector<T> ids(batch_size);
for (size_t i = 0; i < batch_size; ++i) {
for (int i = 0; i < batch_size; ++i) {
T r = dist(engine);
int idx = width - 1;
for (int j = 0; j < width; ++j) {
@ -63,7 +63,7 @@ class SamplingIdKernel : public framework::OpKernel<T> {
break;
}
}
ids[i] = ins_vector[i * width + idx];
ids[i] = ins_vector[idx];
}
std::vector<int64_t> out_dim;

@ -0,0 +1,28 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/stack_op.h"
namespace plat = paddle::platform;
namespace ops = paddle::operators;
REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker,
ops::StackGradOpDescMaker);
REGISTER_OPERATOR(stack_grad, ops::StackOpGrad);
REGISTER_OP_CPU_KERNEL(stack, ops::StackKernel<plat::CPUDeviceContext, float>,
ops::StackKernel<plat::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(stack_grad,
ops::StackGradKernel<plat::CPUDeviceContext, float>,
ops::StackGradKernel<plat::CPUDeviceContext, double>);

@ -0,0 +1,25 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/stack_op.h"
namespace plat = paddle::platform;
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel<plat::CUDADeviceContext, float>,
ops::StackKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(stack_grad,
ops::StackGradKernel<plat::CUDADeviceContext, float>,
ops::StackGradKernel<plat::CUDADeviceContext, double>);

File diff suppressed because it is too large Load Diff

@ -58,11 +58,15 @@ class WhileOp : public framework::OperatorBase {
PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
"Condition of while op must in CPU memory.");
bool is_test = Attr<bool>("is_test");
auto ctx = executor.Prepare(*program, block->ID());
while (cond.data<bool>()[0]) {
auto &current_scope = scope.NewScope();
step_scopes->push_back(&current_scope);
executor.RunPreparedContext(ctx.get(), &current_scope, false);
if (is_test) {
scope.DeleteScope(&current_scope);
}
}
}
};
@ -88,6 +92,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
"variables generated in the i'th step.");
AddAttr<framework::BlockDesc *>(kStepBlock,
"The step block inside WhileOp");
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
AddComment(R"DOC(
)DOC");
}
@ -103,6 +108,8 @@ class WhileGradOp : public framework::OperatorBase {
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
PADDLE_ENFORCE(!Attr<bool>("is_test"),
"GradOp is only callable when is_test is false");
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);

@ -103,15 +103,16 @@ size_t CUDAPinnedMaxChunkSize() {
return CUDAPinnedMaxAllocSize() / 256;
}
#ifdef PADDLE_WITH_XBYAK
namespace jit {
#ifdef PADDLE_WITH_XBYAK
static Xbyak::util::Cpu cpu;
bool MayIUse(const cpu_isa_t cpu_isa) {
using namespace Xbyak::util; // NOLINT
switch (cpu_isa) {
case sse42:
return cpu.has(Cpu::tSSE42);
case avx:
return cpu.has(Cpu::tAVX);
case avx2:
return cpu.has(Cpu::tAVX2);
case avx512_common:
@ -134,8 +135,16 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
}
return false;
}
#else
bool MayIUse(const cpu_isa_t cpu_isa) {
if (cpu_isa == isa_any) {
return true;
} else {
return false;
}
}
#endif
} // namespace jit
#endif
} // namespace platform
} // namespace paddle

@ -37,12 +37,11 @@ size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();
#ifdef PADDLE_WITH_XBYAK
namespace jit {
typedef enum {
isa_any,
sse42,
avx,
avx2,
avx512_common,
avx512_core,
@ -55,7 +54,6 @@ typedef enum {
inline bool MayIUse(const cpu_isa_t cpu_isa);
} // namespace jit
#endif
} // namespace platform
} // namespace paddle

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save