remove conflict

del_some_in_makelist
chengduoZH 7 years ago
commit 812c5f60eb

@ -8,15 +8,19 @@ function clock_to_seconds() {
}
function infer() {
unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
topology=$1
layer_num=$2
bs=$3
thread=`nproc`
if [ $thread -gt $bs ]; then
thread=$bs
trainers=`nproc`
if [ $trainers -gt $bs ]; then
trainers=$bs
fi
log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"
log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
threads=$((`nproc` / trainers))
if [ $threads -eq 0 ]; then
threads=1
fi
export OPENBLAS_NUM_THREADS=$threads
models_in="models/${topology}-${layer_num}/pass-00000/"
if [ ! -d $models_in ]; then
@ -28,7 +32,7 @@ function infer() {
--config="${topology}.py" \
--use_mkldnn=False \
--use_gpu=False \
--trainer_count=$thread \
--trainer_count=$trainers \
--log_period=$log_period \
--config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
--init_model_path=$models_in \

@ -1,7 +1,7 @@
set -e
function train() {
unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
export OPENBLAS_NUM_THREADS=1
topology=$1
layer_num=$2
bs=$3

@ -252,6 +252,11 @@ first_seq
.. autoclass:: paddle.v2.layer.first_seq
:noindex:
sub_seq
---------
.. autoclass:: paddle.v2.layer.sub_seq
:noindex:
concat
------
.. autoclass:: paddle.v2.layer.concat

@ -68,12 +68,6 @@ scale
:noindex:
reshape
---------
.. autofunction:: paddle.v2.fluid.layers.reshape
:noindex:
transpose
---------
.. autofunction:: paddle.v2.fluid.layers.transpose

@ -79,7 +79,7 @@ class Optimizer(object):
def minimize(self, loss, parameter_list):
"""Add operations to minimize `loss` by updating `parameter_list`.
This method combines interface `append_backward_ops()` and
This method combines interface `append_backward()` and
`create_optimization_pass()` into one.
"""
params_grads = self.create_backward_pass(loss, parameter_list)

@ -15,7 +15,7 @@
获取PaddlePaddle的Docker镜像
------------------------------
执行下面的命令获取最新的PaddlePaddle Docker镜像
执行下面的命令获取最新的PaddlePaddle Docker镜像版本为cpu_avx_mkl
.. code-block:: bash
@ -27,7 +27,7 @@
docker pull docker.paddlepaddle.org/paddle
下载GPU版本的Docker镜像
下载GPU版本cuda8.0_cudnn5_avx_mkl的Docker镜像
.. code-block:: bash
@ -54,7 +54,7 @@
.. _docker_run:
在Docker中执行PaddlePaddle训练程序
------------------------------
----------------------------------
假设您已经在当前目录(比如在/home/work编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考
`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_
@ -82,7 +82,7 @@
.. _docker_run_book:
使用Docker启动PaddlePaddle Book教程
------------------------------
-----------------------------------
使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook可以通过网页浏览。
PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。

@ -16,7 +16,7 @@ After you've read above tutorials you may proceed the following steps.
Pull PaddlePaddle Docker Image
------------------------------
Run the following command to download the latest Docker images:
Run the following command to download the latest Docker images, the version is cpu_avx_mkl:
.. code-block:: bash
@ -28,7 +28,7 @@ For users in China, we provide a faster mirror:
docker pull docker.paddlepaddle.org/paddle
Download GPU version images:
Download GPU version (cuda8.0_cudnn5_avx_mkl) images:
.. code-block:: bash
@ -58,7 +58,7 @@ and run:
.. _docker_run:
Launch your training program in Docker
------------------------------
--------------------------------------
Assume that you have already written a PaddlePaddle program
named :code:`train.py` under directory :code:`/home/work` (refer to

@ -11,14 +11,14 @@ PaddlePaddle可以使用常用的Python包管理工具
------------------------------
执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境并自动下载安装依赖软件。
执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境并自动下载安装依赖软件版本为cpu_avx_openblas
.. code-block:: bash
pip install paddlepaddle
如果需要安装支持GPU的版本需要执行
如果需要安装支持GPU的版本cuda7.5_cudnn5_avx_openblas,需要执行:
.. code-block:: bash

@ -12,14 +12,14 @@ Install Using pip
------------------------------
Run the following command to install PaddlePaddle on the current
machine, it will also download requirements.
machine, it will also download requirements, the version is cpu_avx_openblas.
.. code-block:: bash
pip install paddlepaddle
If you wish to install GPU version, just run:
If you wish to install GPU version (cuda7.5_cudnn5_avx_openblas), just run:
.. code-block:: bash

@ -7,13 +7,13 @@
++++++++
PaddlePaddle支持使用pip快速安装目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12并安装有Python2.7。
执行下面的命令完成快速安装:
执行下面的命令完成快速安装版本为cpu_avx_openblas
.. code-block:: bash
pip install paddlepaddle
如果需要安装支持GPU的版本需要执行
如果需要安装支持GPU的版本cuda7.5_cudnn5_avx_openblas,需要执行:
.. code-block:: bash

@ -8,13 +8,13 @@ Quick Install
You can use pip to install PaddlePaddle with a single command, supports
CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
Simply run the following command to install:
Simply run the following command to install, the version is cpu_avx_openblas:
.. code-block:: bash
pip install paddlepaddle
If you need to install GPU version, run:
If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
.. code-block:: bash

@ -5,10 +5,18 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
if (WITH_GPU)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
endif ()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
if (WITH_GPU)
nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor)
else()
cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
endif()
cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)

@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/data_transform.h"
#include "paddle/framework/lod_tensor.h"
namespace paddle {
namespace framework {

@ -27,8 +27,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
using DataTransformFN =
std::function<void(const std::vector<platform::DeviceContext*> ctx,
using DataTransformFn = std::function<void(const platform::DeviceContext* ctx,
const Variable& in, Variable* out)>;
using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
@ -47,7 +46,7 @@ struct KernelTypePairHash {
};
using DataTransformMap =
std::unordered_map<KernelTypePair, DataTransformFN, KernelTypePairHash>;
std::unordered_map<KernelTypePair, DataTransformFn, KernelTypePairHash>;
class DataTransformFnMap {
public:
@ -58,25 +57,25 @@ class DataTransformFnMap {
}
void Insert(const OpKernelType& left, const OpKernelType& right,
const DataTransformFN& data_tranform_fn) {
const DataTransformFn& data_tranform_fn) {
Insert(std::make_pair(left, right), data_tranform_fn);
}
void Insert(const KernelTypePair& kernel_type_pair,
const DataTransformFN& data_tranform_fn) {
const DataTransformFn& data_tranform_fn) {
PADDLE_ENFORCE(!Has(kernel_type_pair),
"KernelTypePair %s has been registered", "");
map_.insert({kernel_type_pair, data_tranform_fn});
}
const DataTransformFN& Get(const KernelTypePair& key_pair) const {
const DataTransformFn& Get(const KernelTypePair& key_pair) const {
auto data_transformer = GetNullable(key_pair);
PADDLE_ENFORCE_NOT_NULL(data_transformer,
"DataTransformFN should not be NULL");
"DataTransformFn should not be NULL");
return *data_transformer;
}
const DataTransformFN* GetNullable(const KernelTypePair& key_pair) const {
const DataTransformFn* GetNullable(const KernelTypePair& key_pair) const {
auto it = map_.find(key_pair);
if (it == map_.end()) {
return nullptr;

@ -11,36 +11,61 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <array>
#include <vector>
#include "paddle/framework/data_transform.h"
#include <gtest/gtest.h>
#include "paddle/framework/data_transform.h"
namespace paddle {
namespace framework {
using namespace platform;
/**
* @brief cross validation of different kernel type transform
* We use four bit map represent different combination.
* If the field has multiple possible value, only choose two of them.
* For DataType, only test the FP32(float), FP64(double).
* e.g. 0000 -> FP32, CPUPlace, kNHWC, kPlain
* 1111 -> FP64, GPUPlace, kNCHW, kMKLDNN
*/
std::array<proto::DataType, 2> kDataType = {
{proto::DataType::FP32, proto::DataType::FP64}};
std::array<Place, 2> kPlace = {{CPUPlace(), CUDAPlace(0)}};
std::array<DataLayout, 2> kDataLayout = {
{DataLayout::kNHWC, DataLayout::kNCHW}};
std::array<LibraryType, 2> kLibraryType = {
{LibraryType::kPlain, LibraryType::kMKLDNN}};
OpKernelType GenFromBit(const std::vector<bool> bits) {
return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]],
kLibraryType[bits[3]]);
}
int test_value = 0;
OpKernelType kernel_type_1(proto::DataType::FP32, CPUPlace(), DataLayout::kNCHW,
LibraryType::kCUDNN);
OpKernelType kernel_type_2(proto::DataType::FP32, CUDAPlace(0),
DataLayout::kNCHW, LibraryType::kCUDNN);
OpKernelType kernel_type_3(proto::DataType::FP16, CUDAPlace(0),
DataLayout::kNCHW, LibraryType::kCUDNN);
auto kernel0 = GenFromBit({0, 0, 0, 0});
auto kernel1 = GenFromBit({0, 0, 0, 1});
auto kernel2 = GenFromBit({0, 0, 1, 0});
auto kernel3 = GenFromBit({0, 0, 1, 1});
void type1_to_type2(std::vector<platform::DeviceContext*> ctx,
const Variable& in, Variable* out) {
void TransDataType_t(const platform::DeviceContext* ctx, const Variable& in,
Variable* out) {
test_value++;
}
void type2_to_type3(std::vector<platform::DeviceContext*> ctx,
const Variable& in, Variable* out) {
void TransDataLayout_t(const platform::DeviceContext* ctx, const Variable& in,
Variable* out) {
test_value--;
}
void type1_to_type3(std::vector<platform::DeviceContext*> ctx,
const Variable& in, Variable* out) {
void TransLibraryType_t(const platform::DeviceContext* ctx, const Variable& in,
Variable* out) {
test_value += 2;
}
@ -49,12 +74,9 @@ void type1_to_type3(std::vector<platform::DeviceContext*> ctx,
namespace frw = paddle::framework;
REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_2,
frw::type1_to_type2);
REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_2, frw::kernel_type_3,
frw::type2_to_type3);
REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_3,
frw::type1_to_type3);
REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel1, frw::TransDataType_t);
REGISTER_DATA_TRANSFORM_FN(frw::kernel1, frw::kernel2, frw::TransDataLayout_t);
REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel2, frw::TransLibraryType_t);
TEST(DataTransform, Register) {
using namespace paddle::framework;
@ -62,17 +84,16 @@ TEST(DataTransform, Register) {
auto& instance = DataTransformFnMap::Instance();
ASSERT_EQ(instance.Map().size(), 3UL);
std::vector<DeviceContext*> ctx;
DeviceContext* ctx = nullptr;
paddle::framework::Variable in;
paddle::framework::Variable out;
instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_2))(ctx, in,
&out);
instance.Get(std::make_pair(frw::kernel0, frw::kernel1))(ctx, in, &out);
ASSERT_EQ(test_value, 1);
instance.Get(std::make_pair(frw::kernel_type_2, frw::kernel_type_3))(ctx, in,
&out);
instance.Get(std::make_pair(frw::kernel1, frw::kernel2))(ctx, in, &out);
ASSERT_EQ(test_value, 0);
instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_3))(ctx, in,
&out);
instance.Get(std::make_pair(frw::kernel0, frw::kernel2))(ctx, in, &out);
ASSERT_EQ(test_value, 2);
}

@ -14,18 +14,17 @@ limitations under the License. */
#include "paddle/framework/executor.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <set>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/scope.h"
DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely.");
namespace paddle {
namespace framework {
@ -58,6 +57,19 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
}
}
static void CheckTensorNANOrInf(const std::string& name,
const framework::Tensor& tensor) {
if (tensor.memory_size() == 0) {
return;
}
if (tensor.type().hash_code() != typeid(float).hash_code() &&
tensor.type().hash_code() != typeid(double).hash_code()) {
return;
}
PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name);
PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name);
}
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) {
// TODO(tonyyang-svail):
@ -101,8 +113,17 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(3) << op->DebugString();
op->Run(*local_scope, place_);
if (FLAGS_check_nan_inf) {
for (auto& vname : op->OutputVars(true)) {
auto* var = local_scope->FindVar(vname);
if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) {
CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
}
if (create_local_scope) {
}
}
}
if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope);
}
}

@ -71,7 +71,7 @@ bool InitDevices(const std::vector<std::string> &devices) {
places.emplace_back(platform::CPUPlace());
LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
}
platform::DeviceContextPool::Create(places);
platform::DeviceContextPool::Init(places);
return true;
}

@ -189,58 +189,12 @@ void AppendLoD(LoD *lod, const LoD &lod_length) {
void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
const platform::DeviceContext &dev_ctx) {
// TODO(typhoonzero): serialize to ostream
{ // the 1st field, uint32_t version
{ // the 1st field, uint32_t version for LoDTensor
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char *>(&version), sizeof(version));
}
{ // the 2nd field, tensor description
// int32_t size
// void* protobuf message
proto::TensorDesc desc;
desc.set_data_type(framework::ToDataType(tensor.type()));
auto dims = framework::vectorize(tensor.dims());
auto *pb_dims = desc.mutable_dims();
pb_dims->Resize(static_cast<int>(dims.size()), 0);
std::copy(dims.begin(), dims.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
os.write(reinterpret_cast<const char *>(&size), sizeof(size));
auto out = desc.SerializeAsString();
os.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.memory_size();
auto *data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto &gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext &>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void *>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
os.write(static_cast<const char *>(data_ptr),
static_cast<std::streamsize>(size));
}
}
{ // the 4th field, lod information
{
// the 2st field, LoD information
// uint64_t lod_level
// uint64_t lod_level_1 size in byte.
// int* lod_level_1 data
@ -256,49 +210,19 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
static_cast<std::streamsize>(size));
}
}
// the 3st field, Tensor
SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx);
}
void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
{
// the 1st field, unit32_t version for SelectedRows
uint32_t version;
is.read(reinterpret_cast<char *>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
proto::TensorDesc desc;
{ // int32_t size
// proto buffer
int32_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char *>(buf.get()), size);
PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
"Cannot parse tensor desc");
}
{ // read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void *buf;
platform::Place cpu = platform::CPUPlace();
switch (desc.data_type()) {
case proto::FP32:
buf = tensor->mutable_data<float>(cpu);
break;
case proto::FP64:
buf = tensor->mutable_data<double>(cpu);
break;
case proto::INT32:
buf = tensor->mutable_data<int>(cpu);
break;
case proto::INT64:
buf = tensor->mutable_data<int64_t>(cpu);
break;
default:
PADDLE_THROW("DataType %d not supported", desc.data_type());
}
is.read(static_cast<char *>(buf), tensor->memory_size());
}
{ // read lod
{
// the 2st field, LoD information
uint64_t lod_level;
is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
auto &lod = *tensor->mutable_lod();
@ -312,6 +236,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
lod[i] = tmp;
}
}
// the 3st filed, Tensor
DeserializeFromStream(is, static_cast<Tensor *>(tensor));
}
} // namespace framework

@ -126,6 +126,20 @@ TEST_F(LoDTensorTester, ShrinkInLevel) {
EXPECT_NE(t1.data<float>(), lod_tensor_.data<float>());
}
TEST_F(LoDTensorTester, SerializeAndDeserialize) {
LoDTensor dst_tensor;
platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
std::ostringstream oss;
SerializeToStream(oss, lod_tensor_, cpu_ctx);
std::istringstream iss(oss.str());
DeserializeFromStream(iss, &dst_tensor);
float* dst_ptr = dst_tensor.mutable_data<float>(platform::CPUPlace());
for (int i = 0; i < kLodTensorSize; ++i) {
EXPECT_EQ(dst_ptr[i], i);
}
EXPECT_EQ(dst_tensor.lod(), lod_tensor_.lod());
}
TEST(LodExpand, test) {
LoD lod{{0, 2}};
LoDTensor tensor;

@ -88,6 +88,14 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
need_update_ = true;
}
void OpDesc::CopyFrom(const OpDesc &op_desc) {
desc_.set_type(op_desc.Type());
inputs_ = op_desc.inputs_;
outputs_ = op_desc.outputs_;
attrs_ = op_desc.attrs_;
need_update_ = true;
}
OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
: desc_(desc), need_update_(false) {
// restore inputs_

@ -35,6 +35,8 @@ class OpDesc {
OpDesc(const proto::OpDesc &desc, ProgramDesc *prog);
void CopyFrom(const OpDesc &op_desc);
proto::OpDesc *Proto();
std::string Type() const { return desc_.type(); }

@ -68,6 +68,8 @@ struct OpKernelType {
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
library_type_ == o.library_type_;
}
bool operator!=(const OpKernelType& o) const { return !(*this == o); }
};
inline std::ostream& operator<<(std::ostream& os,
@ -78,5 +80,11 @@ inline std::ostream& operator<<(std::ostream& os,
return os;
}
inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
std::ostringstream stream;
stream << kernel_key;
return stream.str();
}
} // namespace framework
} // namespace paddle

@ -26,10 +26,8 @@ TEST(OpKernelType, ToString) {
OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
LibraryType::kCUDNN);
std::ostringstream stream;
stream << op_kernel_type;
ASSERT_EQ(
stream.str(),
paddle::framework::KernelTypeToString(op_kernel_type),
"data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]");
}

@ -384,12 +384,30 @@ class RuntimeInferShapeContext : public InferShapeContext {
const Scope& scope_;
};
const platform::DeviceContext* GetDeviceContext(
framework::KernelTypePair& kernel_pair) {
auto& actual_kernel_key = kernel_pair.first;
auto& expected_kernel_key = kernel_pair.second;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
if (platform::is_gpu_place(actual_kernel_key.place_) &&
platform::is_cpu_place(expected_kernel_key.place_)) {
return pool.Get(actual_kernel_key.place_);
} else if (platform::is_cpu_place(actual_kernel_key.place_) &&
platform::is_gpu_place(expected_kernel_key.place_)) {
return pool.Get(expected_kernel_key.place_);
} else {
PADDLE_THROW(
"Currently, model parallelism is only supported between CPU and CUDA");
}
}
void OperatorWithKernel::Run(const Scope& scope,
const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
auto dev_ctx = pool.Borrow(place);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto dev_ctx = pool.Get(place);
// check if op[type] has kernel registered.
auto& all_op_kernels = AllOpKernels();
@ -413,37 +431,47 @@ void OperatorWithKernel::Run(const Scope& scope,
}
if (actual_kernel_key == expected_kernel_key) {
kernel_iter->second->Compute(ctx);
PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_,
"Currently, model parallelism is only supported between "
"CPU and other devices. For example, multi-GPU model "
"parallelism will failed.");
} else {
Scope& op_scope = scope.NewScope();
auto kernel_pair = std::make_pair(actual_kernel_key, expected_kernel_key);
const DataTransformFn* trans_fun =
DataTransformFnMap::Instance().GetNullable(kernel_pair);
if (trans_fun) {
auto input_vars = this->InputVars();
// TODO(qijun) filter the input vars that do not need to be transformed
// filter vars that has been transformed
std::vector<std::string> need_trans;
for (auto var_name : input_vars) {
op_scope.Var(var_name);
auto var_name_trans =
var_name + framework::KernelTypeToString(expected_kernel_key);
if (!scope.FindVar(var_name_trans)) {
const_cast<Scope&>(scope).Var(var_name_trans);
need_trans.push_back(var_name);
}
}
// TODO(qijun) get appropriate DeviceContext from DeviceContext pool
platform::DeviceContext* trans_dev_ctx = nullptr;
std::vector<platform::DeviceContext*> trans_dev_ctx_vec{trans_dev_ctx};
// TODO(qijun) get appropriate DataTransformFN from global map
framework::DataTransformFN trans_fun = nullptr;
if (!need_trans.empty()) {
auto trans_dev_ctx = GetDeviceContext(kernel_pair);
// Wait for transform starting
dev_ctx->Wait();
for (auto var_name : input_vars) {
trans_fun(trans_dev_ctx_vec, *(scope.FindVar(var_name)),
op_scope.FindVar(var_name));
for (auto var_name : need_trans) {
(*trans_fun)(trans_dev_ctx, *(scope.FindVar(var_name)),
scope.FindVar(var_name + framework::KernelTypeToString(
expected_kernel_key)));
}
// Wait for data transform finishing
for (auto ctx : trans_dev_ctx_vec) {
ctx->Wait();
trans_dev_ctx->Wait();
}
// Create a new ExecutionContext
ExecutionContext op_ctx(*this, op_scope, *dev_ctx);
kernel_iter->second->Compute(op_ctx);
}
}
kernel_iter->second->Compute(ctx);
}
OpKernelType OperatorWithKernel::GetActualKernelType(

@ -12,5 +12,58 @@ limitations under the License. */
#include "paddle/framework/selected_rows.h"
namespace paddle {
namespace framework {} // namespace framework
namespace framework {
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
const platform::DeviceContext& dev_ctx) {
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char*>(&version), sizeof(version));
}
{
// the 2st field, rows information
auto& rows = selected_rows.rows();
uint64_t size = rows.size();
os.write(reinterpret_cast<const char*>(&size), sizeof(size));
for (uint64_t i = 0; i < size; ++i) {
os.write(reinterpret_cast<const char*>(&rows[i]), sizeof(rows[i]));
}
}
{
// the 3st field, the height of SelectedRows
int64_t height = selected_rows.height();
os.write(reinterpret_cast<const char*>(&height), sizeof(height));
}
// the 4st field, Tensor data
SerializeToStream(os, selected_rows.value(), dev_ctx);
}
void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows) {
auto tensor = *selected_rows->mutable_value();
{
// the 1st field, unit32_t version for SelectedRows
uint32_t version;
is.read(reinterpret_cast<char*>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
}
{
// the 2st field, rows information
uint64_t size;
is.read(reinterpret_cast<char*>(&size), sizeof(size));
auto& rows = *selected_rows->mutable_rows();
rows.resize(size);
for (uint64_t i = 0; i < size; ++i) {
is.read(reinterpret_cast<char*>(&rows[i]), sizeof(int64_t));
}
}
{
// the 3st field, the height of the SelectedRows
int64_t height;
is.read(reinterpret_cast<char*>(&height), sizeof(int64_t));
selected_rows->set_height(height);
}
// the 4st field, tensor which contains the data
DeserializeFromStream(is, &tensor);
}
} // namespace framework
} // namespace paddle

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save